Source code for mechanize._html

from __future__ import absolute_import

import codecs
import copy
import re

from ._form import parse_forms
from ._headersutil import is_html as _is_html
from ._headersutil import split_header_words
from ._rfc3986 import clean_url, urljoin
from .polyglot import is_string

DEFAULT_ENCODING = "utf-8"
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)


def compile_pats(binary):
    for raw in _encoding_pats:
        if binary:
            raw = raw.encode('ascii')
        yield re.compile(raw, flags=re.IGNORECASE)


class LazyEncodingPats(object):

    def __call__(self, binary=False):
        attr = 'binary_pats' if binary else 'unicode_pats'
        pats = getattr(self, attr, None)
        if pats is None:
            pats = tuple(compile_pats(binary))
            setattr(self, attr, pats)
        for pat in pats:
            yield pat


lazy_encoding_pats = LazyEncodingPats()


def find_declared_encoding(raw, limit=50*1024):
    prefix = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        m = pat.search(prefix)
        if m is not None:
            ans = m.group(1)
            if is_binary:
                ans = ans.decode('ascii', 'replace')
                return ans


def elem_text(elem):
    if elem.text:
        yield elem.text
    for child in elem:
        for text in elem_text(child):
            yield text
        if child.tail:
            yield child.tail


def iterlinks(root, base_url):
    link_tags = {"a": "href", "area": "href", "iframe": "src"}
    for tag in root.iter('*'):
        if not is_string(tag.tag):
            continue
        q = tag.tag.lower()
        attr = link_tags.get(q)
        if attr is not None:
            val = tag.get(attr)
            if val:
                url = clean_url(val)
                yield Link(base_url, url,
                           compress_whitespace(u''.join(elem_text(tag))), q,
                           tag.items())
        elif q == 'base':
            href = tag.get('href')
            if href:
                base_url = href


def compress_whitespace(text):
    return re.sub(r'\s+', ' ', text or '').strip()


def get_encoding_from_response(response, verify=True):
    # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
    # headers may be in the response.  HTTP-EQUIV headers come last,
    # so try in order from first to last.
    if response:
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    if not verify:
                        return v
                    try:
                        codecs.lookup(v)
                        return v
                    except LookupError:
                        continue


class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        return get_encoding_from_response(response) or self._default_encoding


class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)





[docs]def content_parser(data, url=None, response_info=None, transport_encoding=None, default_encoding=DEFAULT_ENCODING, is_html=True): ''' Parse data (a bytes object) into an etree representation such as :py:mod:`xml.etree.ElementTree` or `lxml.etree` :param bytes data: The data to parse :param url: The URL of the document being parsed or None :param response_info: Information about the document (contains all HTTP headers as :class:`HTTPMessage`) :param transport_encoding: The character encoding for the document being parsed as specified in the HTTP headers or None. :param default_encoding: The character encoding to use if no encoding could be detected and no transport_encoding is specified :param is_html: If the document is to be parsed as HTML. ''' if not is_html: return try: from html5_parser import parse except Exception: from html5lib import parse kw = {'namespaceHTMLElements': False} if transport_encoding and isinstance(data, bytes): kw['transport_encoding'] = transport_encoding return parse(data, **kw) else: return parse(data, transport_encoding=transport_encoding)
def get_title(root): for title in root.iter('title'): text = compress_whitespace(title.text) if text: return text lazy = object() class Factory: """Factory for forms, links, etc. This interface may expand in future. Public methods: set_request_class(request_class) set_response(response) forms() links() Public attributes: Note that accessing these attributes may raise ParseError. encoding: string specifying the encoding of response if it contains a text document (this value is left unspecified for documents that do not have an encoding, e.g. an image file) is_html: true if response contains an HTML document (XHTML may be regarded as HTML too) title: page title, or None if no title or not HTML global_form: form object containing all controls that are not descendants of any FORM element, or None if the forms_factory does not support supplying a global form """ def __init__( self, default_encoding=DEFAULT_ENCODING, allow_xhtml=False, ): """ Pass keyword arguments only. """ self._encoding_finder = EncodingFinder(default_encoding) self.form_encoding = default_encoding self._response_type_finder = ResponseTypeFinder( allow_xhtml=allow_xhtml) self._content_parser = content_parser self._current_forms = self._current_links = self._current_title = lazy self._current_global_form = self._root = lazy self._raw_data = b'' self.is_html, self.encoding = False, DEFAULT_ENCODING self.set_response(None) def set_content_parser(self, val): self._content_parser = val def set_request_class(self, request_class): """Set request class (mechanize.Request by default). HTMLForm instances returned by .forms() will return instances of this class when .click()ed. """ self._request_class = request_class def set_response(self, response): """Set response. The response must either be None or implement the same interface as objects returned by mechanize.urlopen(). """ self._response = copy.copy(response) self._current_forms = self._current_links = self._current_title = lazy self._current_global_form = self._root = lazy self.encoding = self._encoding_finder.encoding(self._response) self.is_html = self._response_type_finder.is_html( self._response, self.encoding) if self._response else False @property def root(self): if self._root is lazy: response = self._response raw = self._response.read() if self._response else b'' default_encoding = self._encoding_finder._default_encoding transport_encoding = get_encoding_from_response(response, verify=False) declared_encoding = find_declared_encoding(raw) self.form_encoding = declared_encoding or transport_encoding or default_encoding self._root = self._content_parser( raw, url=response.geturl() if response else None, response_info=response.info() if response else None, default_encoding=default_encoding, is_html=self.is_html, transport_encoding=transport_encoding) return self._root @property def title(self): if self._current_title is lazy: self._current_title = get_title( self.root) if self.root is not None else None return self._current_title or u'' @property def global_form(self): if self._current_global_form is lazy: self.forms() return self._current_global_form def forms(self): """ Return tuple of HTMLForm-like objects. """ # this implementation sets .global_form as a side-effect if self._current_forms is lazy: self._current_forms, self._current_global_form = self._get_forms() return self._current_forms def links(self): """Return tuple of mechanize.Link-like objects. """ if self._current_links is lazy: self._current_links = self._get_links() return self._get_links() def _get_links(self): if self.root is None: return () return tuple(iterlinks(self.root, self._response.geturl())) def _get_forms(self): if self.root is None: return (), None return parse_forms(self.root, self._response.geturl(), self._request_class, encoding=self.form_encoding)