Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow JSON in is_html #68

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions mechanize/_headersutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,27 @@
import _rfc3986


def is_html_file_extension(url, allow_xhtml):
def is_html_file_extension(url, allow_xhtml, allow_json=False):
ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
html_exts = [".htm", ".html"]
if allow_xhtml:
html_exts += [".xhtml"]
if allow_json:
html_exts += [".json"]
return ext in html_exts


def is_html(ct_headers, url, allow_xhtml=False):
def is_html(ct_headers, url, allow_xhtml=False, allow_json=False):
"""
ct_headers: Sequence of Content-Type headers
url: Response URL

"""
if not ct_headers:
return is_html_file_extension(url, allow_xhtml)
return is_html_file_extension(url, allow_xhtml, allow_json)
headers = split_header_words(ct_headers)
if len(headers) < 1:
return is_html_file_extension(url, allow_xhtml)
return is_html_file_extension(url, allow_xhtml, allow_json)
first_header = headers[0]
first_parameter = first_header[0]
ct = first_parameter[0]
Expand All @@ -46,6 +48,10 @@ def is_html(ct_headers, url, allow_xhtml=False):
"text/xhtml", "text/xml",
"application/xml", "application/xhtml+xml",
]
if allow_json:
html_types += [
"application/json",
]
return ct in html_types


Expand Down Expand Up @@ -234,8 +240,8 @@ def parse_ns_headers(ns_headers):


def _test():
import doctest, _headersutil
return doctest.testmod(_headersutil)
import doctest, _headersutil
return doctest.testmod(_headersutil)

if __name__ == "__main__":
_test()
_test()
20 changes: 12 additions & 8 deletions mechanize/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,14 @@ def encoding(self, response):


class ResponseTypeFinder:
def __init__(self, allow_xhtml):
def __init__(self, allow_xhtml, allow_json=False):
self._allow_xhtml = allow_xhtml
self._allow_json = allow_json
def is_html(self, response, encoding):
ct_hdrs = response.info().getheaders("content-type")
url = response.geturl()
# XXX encoding
return _is_html(ct_hdrs, url, self._allow_xhtml)
return _is_html(ct_hdrs, url, self._allow_xhtml, self._allow_json)


class Args(object):
Expand Down Expand Up @@ -463,8 +464,8 @@ class Factory:
encoding: string specifying the encoding of response if it contains a text
document (this value is left unspecified for documents that do not have
an encoding, e.g. an image file)
is_html: true if response contains an HTML document (XHTML may be
regarded as HTML too)
is_html: true if response contains an HTML document (XHTML and/or JSON may
be regarded as HTML too)
title: page title, or None if no title or not HTML
global_form: form object containing all controls that are not descendants
of any FORM element, or None if the forms_factory does not support
Expand All @@ -476,7 +477,7 @@ class Factory:

def __init__(self, forms_factory, links_factory, title_factory,
encoding_finder=EncodingFinder(DEFAULT_ENCODING),
response_type_finder=ResponseTypeFinder(allow_xhtml=False),
response_type_finder=ResponseTypeFinder(allow_xhtml=False, allow_json=False),
):
"""

Expand Down Expand Up @@ -578,14 +579,15 @@ def links(self):

class DefaultFactory(Factory):
"""Based on sgmllib."""
def __init__(self, i_want_broken_xhtml_support=False):
def __init__(self, i_want_broken_xhtml_support=False, i_want_broken_json_support=False):
Factory.__init__(
self,
forms_factory=FormsFactory(),
links_factory=LinksFactory(),
title_factory=TitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
allow_xhtml=i_want_broken_xhtml_support,
allow_json=i_want_broken_json_support),
)

def set_response(self, response):
Expand All @@ -604,14 +606,16 @@ class RobustFactory(Factory):

"""
def __init__(self, i_want_broken_xhtml_support=False,
i_want_broken_json_support=False,
soup_class=None):
Factory.__init__(
self,
forms_factory=RobustFormsFactory(),
links_factory=RobustLinksFactory(),
title_factory=RobustTitleFactory(),
response_type_finder=ResponseTypeFinder(
allow_xhtml=i_want_broken_xhtml_support),
allow_xhtml=i_want_broken_xhtml_support,
allow_json=i_want_broken_json_support),
)
if soup_class is None:
soup_class = MechanizeBs
Expand Down
4 changes: 3 additions & 1 deletion mechanize/_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,17 +184,19 @@ class HTTPEquivProcessor(BaseHandler):

def __init__(self, head_parser_class=HeadParser,
i_want_broken_xhtml_support=False,
i_want_broken_json_support=False
):
self.head_parser_class = head_parser_class
self._allow_xhtml = i_want_broken_xhtml_support
self._allow_json = i_want_broken_json_support

def http_response(self, request, response):
if not hasattr(response, "seek"):
response = response_seek_wrapper(response)
http_message = response.info()
url = response.geturl()
ct_hdrs = http_message.getheaders("content-type")
if is_html(ct_hdrs, url, self._allow_xhtml):
if is_html(ct_hdrs, url, self._allow_xhtml, self._allow_json):
try:
try:
html_headers = parse_head(response,
Expand Down