""":mod:`pandas.io.html` is a module containing functionality for dealing with
HTML IO.
"""
from distutils.version import LooseVersion
import numbers
import os
import re
import pandas.compat as compat
from pandas.compat import (
binary_type, iteritems, lmap, lrange, raise_with_traceback, string_types,
u)
from pandas.errors import AbstractMethodError, EmptyDataError
from pandas.core.dtypes.common import is_list_like
from pandas import Series
from pandas.io.common import _is_url, _validate_header_arg, urlopen
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser
_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False
def _importers():
# import things we need
# but make this done on a first use basis
global _IMPORTS
if _IMPORTS:
return
global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
try:
import bs4 # noqa
_HAS_BS4 = True
except ImportError:
pass
try:
import lxml # noqa
_HAS_LXML = True
except ImportError:
pass
try:
import html5lib # noqa
_HAS_HTML5LIB = True
except ImportError:
pass
_IMPORTS = True
#############
# READ HTML #
#############
# Default pattern used by ``_remove_whitespace``: matches either a run of
# carriage returns / newlines, or two or more consecutive whitespace
# characters.
_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
# Tuple of types that ``_read`` treats as raw text input: every entry of
# ``string_types`` plus ``binary_type``.
char_types = string_types + (binary_type,)
def _remove_whitespace(s, regex=_RE_WHITESPACE):
    """Collapse extra whitespace inside a string into single spaces.

    Leading and trailing whitespace is stripped first; every match of
    `regex` in what remains is then replaced by one space character.

    Parameters
    ----------
    s : str or unicode
        The string to clean up.
    regex : regex
        The compiled pattern describing the whitespace to replace.

    Returns
    -------
    subd : str or unicode
        `s` with each stretch of extra whitespace reduced to a single space.
    """
    trimmed = s.strip()
    return regex.sub(' ', trimmed)
def _get_skiprows(skiprows):
"""Get an iterator given an integer, slice or container.
Parameters
----------
skiprows : int, slice, container
The iterator to use to skip rows; can also be a slice.
Raises
------
TypeError
* If `skiprows` is not a slice, integer, or Container
Returns
-------
it : iterable
A proper iterator to use to skip rows of a DataFrame.
"""
if isinstance(skiprows, slice):
return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1)
elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
return skiprows
elif skiprows is None:
return 0
raise TypeError('%r is not a valid type for skipping rows' %
type(skiprows).__name__)
def _read(obj):
    """Fetch raw content from a URL, a file-like object, a path or a string.

    Parameters
    ----------
    obj : str, unicode, or file-like
        * A URL (as judged by ``_is_url``) is opened with ``urlopen`` and
          read.
        * An object with a ``read`` attribute has ``read()`` called on it.
        * A string naming an existing file has that file read in binary
          mode; any other string is returned unchanged.

    Returns
    -------
    raw_text : str

    Raises
    ------
    TypeError
        If `obj` is none of the supported kinds of input.
    """
    if _is_url(obj):
        with urlopen(obj) as response:
            return response.read()

    if hasattr(obj, 'read'):
        return obj.read()

    if not isinstance(obj, char_types):
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)

    try:
        if os.path.isfile(obj):
            with open(obj, 'rb') as handle:
                return handle.read()
    except (TypeError, ValueError):
        # the string could not be handled as a path, so it is treated as
        # the content itself
        pass
    return obj
class _HtmlFrameParser(object):
"""Base class for parsers that parse HTML into DataFrames.
Parameters
----------
io : str or file-like
This can be either a string of raw HTML, a valid URL using the HTTP,
FTP, or FILE protocols or a file-like object.
match : str or regex
The text to match in the document.
attrs : dict
List of HTML <table> element attributes to match.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Attributes
----------
io : str or file-like
raw HTML, URL, or file-like object
match : regex
The text to match in the raw HTML
attrs : dict-like
A dictionary of valid table attributes to use to search for table
elements.
encoding : str
Encoding to be used by parser
displayed_only : bool
Whether or not items with "display:none" should be ignored
.. versionadded:: 0.23.0
Notes
-----
To subclass this class effectively you must override the following methods:
* :func:`_build_doc`
* :func:`_attr_getter`
* :func:`_text_getter`
* :func:`_parse_td`
* :func:`_parse_thead_tr`
* :func:`_parse_tbody_tr`
* :func:`_parse_tfoot_tr`
* :func:`_parse_tables`
* :func:`_equals_tag`
See each method's respective documentation for details on their
functionality.
"""
def __init__(self, io, match, attrs, encoding, displayed_only):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only
def parse_tables(self):
    """
    Find every matching table in the document and parse each of them.

    The document is built with ``_build_doc`` and searched with
    ``_parse_tables`` (using ``self.match`` and ``self.attrs``) as soon as
    this method is called; the individual tables are only parsed as the
    result is iterated.

    Returns
    -------
    generator
        Yields, per table found, the parsed (header, body, footer) data
        produced by ``_parse_thead_tbody_tfoot``.
    """
    document = self._build_doc()
    found = self._parse_tables(document, self.match, self.attrs)
    return (self._parse_thead_tbody_tfoot(node) for node in found)
def _attr_getter(self, obj, attr):
"""
Return the attribute value of an individual DOM node.
Parameters
----------
obj : node-like
A DOM node.
attr : str or unicode
The attribute, such as "colspan"
Returns
-------
str or unicode
The attribute value.
"""
# Both lxml and BeautifulSoup have the same implementation:
return obj.get(attr)
def _text_getter(self, obj):
"""
Return the text of an individual DOM node.
Parameters
----------
obj : node-like
A DOM node.
Returns
-------
text : str or unicode
The text from an individual DOM node.
"""
raise AbstractMethodError(self)
def _parse_td(self, obj):
"""Return the td elements from a row element.
Parameters
----------
obj : node-like
A DOM <tr> node.
Returns
-------
list of node-like
These are the elements of each row, i.e., the columns.
"""
raise AbstractMethodError(self)
def _parse_thead_tr(self, table):
"""
Return the list of thead row elements from the parsed table element.
Parameters
----------
table : a table element that contains zero or more thead elements.
Returns
-------
list of node-like
These are the <tr> row elements of a table.
"""
raise AbstractMethodError(self)
def _parse_tbody_tr(self, table):
"""
Return the list of tbody row elements from the parsed table element.
HTML5 table bodies consist of either 0 or more <tbody> elements (which
only contain <tr> elements) or 0 or more <tr> elements. This method
checks for both structures.
Parameters
----------
table : a table element that contains row elements.
Returns
-------
list of node-like
These are the <tr> row elements of a table.
"""
raise AbstractMethodError(self)
def _parse_tfoot_tr(self, table):
"""
Return the list of tfoot row elements from the parsed table element.
Parameters
----------
table : a table element that contains row elements.
Returns
-------
list of node-like
These are the <tr> row elements of a table.
"""
raise AbstractMethodError(self)
def _parse_tables(self, doc, match, attrs):
"""
Return all tables from the parsed DOM.
Parameters
----------
doc : the DOM from which to parse the table element.
match : str or regular expression
The text to search for in the DOM tree.
attrs : dict
A dictionary of table attributes that can be used to disambiguate
multiple tables on a page.
Raises
------
ValueError : `match` does not match any text in the document.
Returns
-------
list of node-like
Loading ...