diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index acaae48..de51e98 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -86,7 +86,7 @@ class ItemLoader: default_input_processor = Identity() default_output_processor = Identity() - def __init__(self, item=None, selector=None, parent=None, **context): + def __init__(self, item=None, selector=None, parent=None, stats=None, **context): self.selector = selector context.update(selector=selector) if item is None: @@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context): for field_name, value in item.items(): self._values[field_name] += arg_to_iter(value) + # This is the new injected dependency that we'll be using as the main + # functionality of this tool. + self.stats = stats + + # This keeps track of the position of the 'field' name that is being + # loaded for a more accurate logging in the stats. + self.field_position_tracker = defaultdict(int) + @property def _values(self): if self.parent is not None: @@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw): loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + self.field_position_tracker[f"{field_name}_xpath"] += 1 + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.add_value(field_name, values, *processors, **kw) def replace_xpath(self, field_name, xpath, *processors, **kw): """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. """ - values = self._get_xpathvalues(xpath, **kw) + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.replace_value(field_name, values, *processors, **kw) def get_xpath(self, xpath, *processors, **kw): @@ -358,14 +367,9 @@ def get_xpath(self, xpath, *processors, **kw): loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + values = self.get_selector_values(None, xpath, 'xpath', **kw) return self.get_value(values, *processors, **kw) - def _get_xpathvalues(self, xpaths, **kw): - self._check_selector_method() - xpaths = arg_to_iter(xpaths) - return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths) - def add_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`ItemLoader.add_value` but receives a CSS selector @@ -384,14 +388,15 @@ def add_css(self, field_name, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.add_css('price', 'p#price', re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + self.field_position_tracker[f"{field_name}_css"] += 1 + values = self.get_selector_values(field_name, css, 'css', **kw) self.add_value(field_name, values, *processors, **kw) def replace_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`add_css` but replaces collected data instead of adding it. """ - values = self._get_cssvalues(css, **kw) + values = self.get_selector_values(field_name, css, 'css', **kw) self.replace_value(field_name, values, *processors, **kw) def get_css(self, css, *processors, **kw): @@ -414,10 +419,57 @@ def get_css(self, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + values = self.get_selector_values(None, css, 'css', **kw) return self.get_value(values, *processors, **kw) - def _get_cssvalues(self, csss, **kw): + def get_selector_values(self, field_name, selector_rules, selector_type, **kw): + self._check_selector_method() - csss = arg_to_iter(csss) - return flatten(self.selector.css(css).getall() for css in csss) + + selector = getattr(self.selector, selector_type or '', None) + + # The optional arg in methods like `add_css()` for context in stats + name = kw.get("name") + + # For every call of `add_css()` and `add_xpath()` this is incremented. + # We'll use it as the base index of the position of the logged stats. + index = self.field_position_tracker[f"{field_name}_{selector_type}"] + + values = [] + for position, rule in enumerate(arg_to_iter(selector_rules), index): + parsed_data = selector(rule).getall() + values.append(parsed_data) + self.write_to_stats( + field_name, parsed_data, position, selector_type, name=name + ) + return flatten(values) + + def write_to_stats( + self, field_name, parsed_data, position, selector_type, name=None + ): + """Responsible for logging the parser rules usage. + + The implementation below where each missing parsed_data is being logged + to the stat is clunky, but necessary. With this, we can only surmise + that it's safe to remove parser fallback parser if it's all just + '.../missing' in the stats. + """ + + if not self.stats or not field_name: + return + + parser_label = ( + f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}" + ) + + if name: + parser_label += f"/{name}" + + if parsed_data in (None, []): + parser_label += "/missing" + + self.stats.inc_value(parser_label) + + @property + def loader_name(self): + return self.__class__.__name__ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_loader_stats.py b/tests/test_loader_stats.py new file mode 100644 index 0000000..9325e5b --- /dev/null +++ b/tests/test_loader_stats.py @@ -0,0 +1,175 @@ +import pytest +from unittest import mock + +from itemloaders import ItemLoader +from parsel import Selector + + +def test_write_to_stats_with_uninjected_stat_dependency(): + """It should not call stats when the stat dependency isn't available.""" + + loader = ItemLoader() + loader.stats = mock.MagicMock() + loader.stats.__bool__.return_value = False # don't pass the if-condition + + assert loader.write_to_stats("field_name", "parsed_data", 0, "xpath") == None + assert not loader.stats.inc_value.called + + +def test_write_to_stats_with_no_parsed_data(): + """It should not call stats when parsing the data returned None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + parsed_data = None + expected_stat_key = "parser/ItemLoader/field_name/css/0/missing" + + assert loader.write_to_stats("field_name", parsed_data, 0, "css") == None + loader.stats.inc_value.assert_called_once_with(expected_stat_key) + + +def test_write_to_stats_with_no_field_name(): + """It should not call stats when the 'field_name' passed is None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + assert loader.write_to_stats(None, "sample data", 0, "css") == None + loader.stats.inc_value.assert_not_called() + + +def test_write_to_stats(): + """It should incremenent the correct key in the stat.""" + + loader = ItemLoader() + loader.stats = mock.MagicMock() + + expected_stat_key = "parser/ItemLoader/field_name/css/0" + + # Rules with values + assert loader.write_to_stats("field_name", "parsed_data", 123, "css") == None + + # Rules that hasn't rendered any values + assert loader.write_to_stats("field_name", None, 456, "css") == None + assert loader.write_to_stats("field_name", [], 789, "css") == None + + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/ItemLoader/field_name/css/123"), + mock.call("parser/ItemLoader/field_name/css/456/missing"), + mock.call("parser/ItemLoader/field_name/css/789/missing"), + ] + ) + + +TEST_HTML_BODY = """ + + This is a title + +
+

Product #1

+ $1.23 +
+ +
+
Product #2
+ $9.99 +
+ + +""" + + +class TestItemLoader(ItemLoader): + pass + + +@pytest.fixture() +def loader(): + mock_stats = mock.MagicMock() + selector = Selector(text=TEST_HTML_BODY) + loader = TestItemLoader(selector=selector, stats=mock_stats) + return loader + + +# NOTES: We'll be using the 'css' methods of ItemLoader below. The 'xpath' +# methods are also using the 'get_selector_values()' method underneath, the +# same with 'css'. So we'll assume that 'xpath' would also pass the test +# if 'css' passes. + +# This assumption will hold true for now, since the current implementation of +# the 'css' and 'xpath' methods are just facades to the 'get_selector_values()'. + + +def test_add_css_1(loader): + loader.add_css("title", "article h2::text") + loader.stats.inc_value.assert_has_calls( + [mock.call("parser/TestItemLoader/title/css/1")] + ) + assert loader.stats.inc_value.call_count == 1 + + +def test_add_css_2(loader): + loader.add_css("title", ["article h2::text", "article .product-title::text"]) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1"), + mock.call("parser/TestItemLoader/title/css/2"), + ] + ) + assert loader.stats.inc_value.call_count == 2 + + +def test_add_css_3_missing(loader): + loader.add_css("title", "h1::text") # The

doesn't exist at all. + loader.stats.inc_value.assert_has_calls( + [mock.call("parser/TestItemLoader/title/css/1/missing")] + ) + assert loader.stats.inc_value.call_count == 1 + + +def test_multiple_1(loader): + loader.add_css("title", "h2::text") + loader.add_css("title", ["article h2::text", "article .product-title::text"]) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1"), + mock.call("parser/TestItemLoader/title/css/2"), + mock.call("parser/TestItemLoader/title/css/3"), + ] + ) + assert loader.stats.inc_value.call_count == 3 + + +def test_multiple_1_with_name(loader): + loader.add_css("title", "h2::text", name="title from h2") + loader.add_css( + "title", + ["article h2::text", "article .product-title::text"], + name="title from article", + ) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1/title from h2"), + mock.call("parser/TestItemLoader/title/css/2/title from article"), + mock.call("parser/TestItemLoader/title/css/3/title from article"), + ] + ) + assert loader.stats.inc_value.call_count == 3 + + +def test_multiple_2_with_name(loader): + loader.add_css("title", "h2::text", name="title from h2") + loader.add_xpath("title", "//article/h2/text()", name="title from article") + loader.add_css("title", "article .product-title::text") + loader.add_xpath("title", "//aside/h1/text()", name="title from aside") + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1/title from h2"), + mock.call("parser/TestItemLoader/title/xpath/1/title from article"), + mock.call("parser/TestItemLoader/title/css/2"), + mock.call("parser/TestItemLoader/title/xpath/2/title from aside/missing"), + ] + ) + assert loader.stats.inc_value.call_count == 4 diff --git a/tests/test_selector_loader.py b/tests/test_selector_loader.py index 3de5d63..d0e6a0e 100644 --- a/tests/test_selector_loader.py +++ b/tests/test_selector_loader.py @@ -1,6 +1,9 @@ import unittest +import pytest +from unittest import mock from parsel import Selector +from parsel.utils import flatten from itemloaders import ItemLoader from itemloaders.processors import MapCompose, TakeFirst @@ -151,3 +154,59 @@ def test_replace_css_re(self): self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org']) loader.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)') self.assertEqual(loader.get_output_value('url'), ['scrapy.org']) + + +def test_get_selector_values_with_no_selector(): + """It should raise an error if it's not configured with any Selector.""" + + loader = ItemLoader() + + with pytest.raises(RuntimeError) as err: + loader.get_selector_values("field_name", [], None) + + +def test_get_selector_values(): + """Selectors must be properly called as well as correctly flatten the data. + + For this test, we're testing 'css', but it should also work the same for 'xpath'. + """ + + selector_rules = ["#rule1", "#rule2", "#rule3"] + field_name = "field" + parsed_data = ["data1", "data2"] + + mock_css_selector = mock.Mock() + mock_css_selector().getall.return_value = parsed_data + mock_css_selector.__name__ = "css" + + mock_selector = mock.Mock() + mock_selector.css = mock_css_selector + + loader = ItemLoader(selector=mock_selector) + loader.write_to_stats = mock.Mock() + + # This wasn't actually initialized so it will return 0 by default otherwise. + loader.field_position_tracker["field_css"] = 1 + + result = loader.get_selector_values(field_name, selector_rules, "css") + + assert result == flatten([parsed_data] * len(selector_rules)) + + mock_selector.assert_has_calls( + [ + mock.call.css(selector_rules[0]), + mock.call.css().getall(), + mock.call.css(selector_rules[1]), + mock.call.css().getall(), + mock.call.css(selector_rules[2]), + mock.call.css().getall(), + ] + ) + + loader.write_to_stats.assert_has_calls( + [ + mock.call(field_name, parsed_data, 1, "css", name=None), + mock.call(field_name, parsed_data, 2, "css", name=None), + mock.call(field_name, parsed_data, 3, "css", name=None), + ] + )