Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Instantiate parsers only once #153

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/parsel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ parsel.selector
:members:
:undoc-members:
:show-inheritance:
:exclude-members: create_root_node


parsel.utils
Expand Down
36 changes: 36 additions & 0 deletions parsel/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from lxml import etree
from lxml.etree import XMLParser as _UnsafeXMLParser
from lxml.html import HTMLParser as _HTMLParser


class _LXMLBaseParser(object):
    """Hold one lxml parser instance and reuse it for every ``parse`` call.

    The wrapped parser is created once, in ``__init__``, with
    ``recover=True`` and ``encoding='utf8'``.
    """

    def __init__(self, parser_cls):
        """Instantiate ``parser_cls`` and keep the resulting parser."""
        options = {'recover': True, 'encoding': 'utf8'}
        self._parser = parser_cls(**options)

    def parse(self, text, base_url):
        """Parse ``text`` and return the root node of the resulting tree.

        ``text`` is stripped of surrounding whitespace and of NUL characters
        and encoded as UTF-8 before parsing. ``base_url`` is forwarded to
        ``etree.fromstring``.
        """
        cleaned = text.strip().replace('\x00', '')
        # Nothing left after cleaning: parse a minimal document instead.
        body = cleaned.encode('utf8') or b'<html/>'
        root = etree.fromstring(body, parser=self._parser, base_url=base_url)
        if root is not None:
            return root
        # The parser produced no root; fall back to the minimal document.
        return etree.fromstring(b'<html/>', parser=self._parser,
                                base_url=base_url)


class HTMLParser(_LXMLBaseParser):
    """``_LXMLBaseParser`` built around ``lxml.html.HTMLParser``."""

    def __init__(self):
        super(HTMLParser, self).__init__(parser_cls=_HTMLParser)


class _XMLParser(_UnsafeXMLParser):
    """``lxml.etree.XMLParser`` with ``resolve_entities`` defaulting to False."""

    def __init__(self, *args, **kwargs):
        # Only fill in the default; an explicit caller value is kept.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(_XMLParser, self).__init__(*args, **kwargs)


class XMLParser(_LXMLBaseParser):
    """``_LXMLBaseParser`` built around ``_XMLParser``."""

    def __init__(self):
        super(XMLParser, self).__init__(parser_cls=_XMLParser)
3 changes: 3 additions & 0 deletions parsel/parser/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from parsel.parser import HTMLParser

# Module-level HTML parser instance, created once when this module is imported.
HTML_PARSER = HTMLParser()
3 changes: 3 additions & 0 deletions parsel/parser/xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from parsel.parser import XMLParser

# Module-level XML parser instance, created once when this module is imported.
XML_PARSER = XMLParser()
39 changes: 34 additions & 5 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,39 @@
"""

import sys
from importlib import import_module
from warnings import warn

import six
from lxml import etree, html
from lxml import etree

from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator


def _load_object(path):
    """Import and return the object found at the dotted ``path``.

    Everything before the last dot is imported as a module; the part after
    it is looked up as an attribute of that module. The target can be a
    class, function, variable or class instance, for example
    ``'parsel.parser.html.HTML_PARSER'``.

    Raises ``ValueError`` when ``path`` contains no dot, and ``NameError``
    when the module has no attribute with the requested name. Errors raised
    while importing the module propagate unchanged.
    """
    module, separator, name = path.rpartition('.')
    if not separator:
        raise ValueError("Error loading object '%s': not a full path" % path)

    mod = import_module(module)
    try:
        return getattr(mod, name)
    except AttributeError:
        raise NameError(
            "Module '%s' doesn't define any object named '%s'"
            % (module, name))


# Exception type with no behavior of its own. The code that raises it is not
# part of this excerpt; judging by the name it presumably signals an attempt
# to remove an element that has no root -- TODO confirm against the raiser.
class CannotRemoveElementWithoutRoot(Exception):
    pass

Expand All @@ -21,14 +46,16 @@ class CannotRemoveElementWithoutParent(Exception):

class SafeXMLParser(etree.XMLParser):
    """Deprecated ``etree.XMLParser`` with ``resolve_entities`` off by default.

    Creating an instance emits a ``DeprecationWarning``.
    """

    def __init__(self, *args, **kwargs):
        message = 'parsel.selector.SafeXMLParser is deprecated'
        # stacklevel=2 attributes the warning to the code creating the parser.
        warn(message, DeprecationWarning, stacklevel=2)
        # Only fill in the default; an explicit caller value is kept.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)

_ctgroup = {
'html': {'_parser': html.HTMLParser,
'html': {'_parser': 'parsel.parser.html.HTML_PARSER',
'_csstranslator': HTMLTranslator(),
'_tostring_method': 'html'},
'xml': {'_parser': SafeXMLParser,
'xml': {'_parser': 'parsel.parser.xml.XML_PARSER',
'_csstranslator': GenericTranslator(),
'_tostring_method': 'xml'},
}
Expand All @@ -46,6 +73,8 @@ def _st(st):
def create_root_node(text, parser_cls, base_url=None):
"""Create root node for text using given parser class.
"""
warn('parsel.selector.create_root_node is deprecated',
DeprecationWarning, stacklevel=2)
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
parser = parser_cls(recover=True, encoding='utf8')
root = etree.fromstring(body, parser=parser, base_url=base_url)
Expand Down Expand Up @@ -195,7 +224,7 @@ class Selector(object):
def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
self.type = st = _st(type or self._default_type)
self._parser = _ctgroup[st]['_parser']
self._parser = _load_object(_ctgroup[st]['_parser'])
self._csstranslator = _ctgroup[st]['_csstranslator']
self._tostring_method = _ctgroup[st]['_tostring_method']

Expand All @@ -218,7 +247,7 @@ def __getstate__(self):
raise TypeError("can't pickle Selector objects")

def _get_root(self, text, base_url=None):
return create_root_node(text, self._parser, base_url=base_url)
return self._parser.parse(text=text, base_url=base_url)

def xpath(self, query, namespaces=None, **kwargs):
"""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_deprecations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding:utf-8 -*-


from unittest import TestCase
from warnings import catch_warnings, simplefilter

from lxml.html import HTMLParser

from parsel.selector import create_root_node, SafeXMLParser


class TestDeprecations(TestCase):
    """Check that the deprecated ``parsel.selector`` helpers warn when used."""

    def _assert_deprecated_once(self, caught, message):
        """Assert that exactly one ``DeprecationWarning`` carries ``message``."""
        matching = [
            record for record in caught
            if issubclass(record.category, DeprecationWarning)
            and str(record.message) == message
        ]
        self.assertEqual(len(matching), 1)

    def test_create_root_node(self):
        with catch_warnings(record=True) as caught:
            # Record every warning: the filters active when the test runs
            # could otherwise ignore or de-duplicate the DeprecationWarning.
            simplefilter('always')
            create_root_node(u'…', HTMLParser)
        self._assert_deprecated_once(
            caught, 'parsel.selector.create_root_node is deprecated')

    def test_SafeXMLParser(self):
        with catch_warnings(record=True) as caught:
            # Same reason as above: make sure the warning is not filtered out.
            simplefilter('always')
            SafeXMLParser()
        self._assert_deprecated_once(
            caught, 'parsel.selector.SafeXMLParser is deprecated')
22 changes: 22 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from parsel import Selector
from parsel.selector import (
_load_object,
CannotRemoveElementWithoutRoot,
CannotRemoveElementWithoutParent,
)
Expand Down Expand Up @@ -913,3 +914,24 @@ def test_set(self):
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)''').extract(),
[u'url', u'name', u'startDate', u'location', u'offers'])


# On interpreters that do not define ``ModuleNotFoundError``, merely naming it
# raises NameError; in that case alias it to ``ImportError`` so the tests
# below can use a single exception name.
try:
    ModuleNotFoundError
except NameError:
    ModuleNotFoundError = ImportError


class LoadObjectTestCase(unittest.TestCase):
    """Failure modes of ``_load_object``."""

    def test_incomplete_path(self):
        # A path without a dot cannot be split into module and object name.
        with self.assertRaises(ValueError):
            _load_object('parsel')

    def test_inexistent_module(self):
        # The module part of the path cannot be imported.
        with self.assertRaises(ModuleNotFoundError):
            _load_object('parsel.inexistent.inexistent')

    def test_inexistent_object(self):
        # The module imports fine but lacks the requested attribute.
        with self.assertRaises(NameError):
            _load_object('parsel.parser.inexistent')