From 05ddbc0d4b5718d31e3f589848377011867dbad9 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 12 May 2020 13:42:00 +0800 Subject: [PATCH 1/7] initial migration of Burnzz/scrapy-loader-upkeep code --- itemloaders/__init__.py | 94 +++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 14 deletions(-) diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index acaae48..9c505cd 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -86,7 +86,7 @@ class ItemLoader: default_input_processor = Identity() default_output_processor = Identity() - def __init__(self, item=None, selector=None, parent=None, **context): + def __init__(self, item=None, selector=None, parent=None, stats=None, **context): self.selector = selector context.update(selector=selector) if item is None: @@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context): for field_name, value in item.items(): self._values[field_name] += arg_to_iter(value) + # This is the new injected dependency that we'll be using as the main + # functionality of this tool. + self.stats = stats + + # This keeps track of the position of the 'field' name that is being + # loaded for a more accurate logging in the stats. + self.field_tracker = defaultdict(int) + @property def _values(self): if self.parent is not None: @@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw): loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + self.field_tracker[f"{field_name}_xpath"] += 1 + values = self._get_xpathvalues(field_name, xpath, **kw) self.add_value(field_name, values, *processors, **kw) def replace_xpath(self, field_name, xpath, *processors, **kw): """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. """ - values = self._get_xpathvalues(xpath, **kw) + values = self._get_xpathvalues(field_name, xpath, **kw) self.replace_value(field_name, values, *processors, **kw) def get_xpath(self, xpath, *processors, **kw): @@ -358,13 +367,11 @@ def get_xpath(self, xpath, *processors, **kw): loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + values = self._get_xpathvalues(None, xpath, **kw) return self.get_value(values, *processors, **kw) - def _get_xpathvalues(self, xpaths, **kw): - self._check_selector_method() - xpaths = arg_to_iter(xpaths) - return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths) + def _get_xpathvalues(self, field_name, xpaths, **kw): + return self.get_selector_values(field_name, xpaths, self.selector.xpath, **kw) def add_css(self, field_name, css, *processors, **kw): """ @@ -384,14 +391,15 @@ def add_css(self, field_name, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.add_css('price', 'p#price', re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + self.field_tracker[f"{field_name}_css"] += 1 + values = self._get_cssvalues(field_name, css, **kw) self.add_value(field_name, values, *processors, **kw) def replace_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`add_css` but replaces collected data instead of adding it. """ - values = self._get_cssvalues(css, **kw) + values = self._get_cssvalues(field_name, css, **kw) self.replace_value(field_name, values, *processors, **kw) def get_css(self, css, *processors, **kw): @@ -414,10 +422,68 @@ def get_css(self, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + values = self._get_cssvalues(None, css, **kw) return self.get_value(values, *processors, **kw) - def _get_cssvalues(self, csss, **kw): + def _get_cssvalues(self, field_name, csss, **kw): + return self.get_selector_values(field_name, csss, self.selector.css, **kw) + + def get_selector_values(self, field_name, selector_rules, selector, **kw): + """Provides an abstraction to _get_xpathvalues() and _get_cssvalues() + since they share the same components. + """ + self._check_selector_method() - csss = arg_to_iter(csss) - return flatten(self.selector.css(css).getall() for css in csss) + + selector_type = selector.__name__ # either 'css' or 'xpath' + + # The optional arg in methods like `add_css()` for context in stats + name = kw.get("name") + + # For every call of `add_css()` and `add_xpath()` this is incremented. + # We'll use it as the base index of the position of the logged stats. + index = self.field_tracker[f"{field_name}_{selector_type}"] + + values = [] + for position, rule in enumerate(arg_to_iter(selector_rules), index): + parsed_data = selector(rule).getall() + values.append(parsed_data) + self.write_to_stats( + field_name, parsed_data, position, selector_type, name=name + ) + return flatten(values) + + def write_to_stats( + self, field_name, parsed_data, position, selector_type, name=None + ): + """Responsible for logging the parser rules usage. + + NOTES: It's hard to easily denote which parser rule hasn't produced any + data for the entire crawl, since ItemLoaders essentially don't know + when the spider is going to be closed, as well as it has many + instantiations all throughout the code. + + The implementation below where each missing parsed_data is being logged + to the stat is clunky, but necessary. With this, we can only surmise + that it's safe to remove parser fallback parser if it's all just + '*/missing' in the stats. + """ + + if not self.stats or not field_name: + return + + parser_label = ( + f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}" + ) + + if name: + parser_label += f"/{name}" + + if parsed_data in (None, []): + parser_label += "/missing" + + self.stats.inc_value(parser_label) + + @property + def loader_name(self): + return self.__class__.__name__ From 1df3891e058827609c0dcad80c5f56032e3a2c4b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 12 May 2020 13:45:05 +0800 Subject: [PATCH 2/7] add __init__.py to tests/ --- tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 42296f2e312c56398ecb4adaa2c9d0ea718a0e00 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 13 May 2020 14:04:11 +0800 Subject: [PATCH 3/7] fix failing tests by combining _get_xpathvalues() and _get_cssvalues() --- itemloaders/__init__.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index 9c505cd..8b1b720 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -336,14 +336,14 @@ def add_xpath(self, field_name, xpath, *processors, **kw): """ self.field_tracker[f"{field_name}_xpath"] += 1 - values = self._get_xpathvalues(field_name, xpath, **kw) + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.add_value(field_name, values, *processors, **kw) def replace_xpath(self, field_name, xpath, *processors, **kw): """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. """ - values = self._get_xpathvalues(field_name, xpath, **kw) + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.replace_value(field_name, values, *processors, **kw) def get_xpath(self, xpath, *processors, **kw): @@ -367,12 +367,9 @@ def get_xpath(self, xpath, *processors, **kw): loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ - values = self._get_xpathvalues(None, xpath, **kw) + values = self.get_selector_values(None, xpath, 'xpath', **kw) return self.get_value(values, *processors, **kw) - def _get_xpathvalues(self, field_name, xpaths, **kw): - return self.get_selector_values(field_name, xpaths, self.selector.xpath, **kw) - def add_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`ItemLoader.add_value` but receives a CSS selector @@ -392,14 +389,14 @@ def add_css(self, field_name, css, *processors, **kw): loader.add_css('price', 'p#price', re='the price is (.*)') """ self.field_tracker[f"{field_name}_css"] += 1 - values = self._get_cssvalues(field_name, css, **kw) + values = self.get_selector_values(field_name, css, 'css', **kw) self.add_value(field_name, values, *processors, **kw) def replace_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`add_css` but replaces collected data instead of adding it. """ - values = self._get_cssvalues(field_name, css, **kw) + values = self.get_selector_values(field_name, css, 'css', **kw) self.replace_value(field_name, values, *processors, **kw) def get_css(self, css, *processors, **kw): @@ -422,16 +419,12 @@ def get_css(self, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ - values = self._get_cssvalues(None, css, **kw) + values = self.get_selector_values(None, css, 'css', **kw) return self.get_value(values, *processors, **kw) - def _get_cssvalues(self, field_name, csss, **kw): - return self.get_selector_values(field_name, csss, self.selector.css, **kw) + def get_selector_values(self, field_name, selector_rules, selector_name, **kw): - def get_selector_values(self, field_name, selector_rules, selector, **kw): - """Provides an abstraction to _get_xpathvalues() and _get_cssvalues() - since they share the same components. - """ + selector = getattr(self.selector, selector_name, None) self._check_selector_method() From 57f5308bef8876f195700bc45b03da8929dae8ab Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 13 May 2020 14:44:48 +0800 Subject: [PATCH 4/7] streamline get_selector_values() and add tests to it --- itemloaders/__init__.py | 6 ++-- tests/test_selector_loader.py | 59 +++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index 8b1b720..b85ebf0 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -422,13 +422,11 @@ def get_css(self, css, *processors, **kw): values = self.get_selector_values(None, css, 'css', **kw) return self.get_value(values, *processors, **kw) - def get_selector_values(self, field_name, selector_rules, selector_name, **kw): - - selector = getattr(self.selector, selector_name, None) + def get_selector_values(self, field_name, selector_rules, selector_type, **kw): self._check_selector_method() - selector_type = selector.__name__ # either 'css' or 'xpath' + selector = getattr(self.selector, selector_type or '', None) # The optional arg in methods like `add_css()` for context in stats name = kw.get("name") diff --git a/tests/test_selector_loader.py b/tests/test_selector_loader.py index 3de5d63..626e0f6 100644 --- a/tests/test_selector_loader.py +++ b/tests/test_selector_loader.py @@ -1,6 +1,9 @@ import unittest +import pytest +from unittest import mock from parsel import Selector +from parsel.utils import flatten from itemloaders import ItemLoader from itemloaders.processors import MapCompose, TakeFirst @@ -151,3 +154,59 @@ def test_replace_css_re(self): self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org']) loader.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)') self.assertEqual(loader.get_output_value('url'), ['scrapy.org']) + + +def test_get_selector_values_with_no_selector(): + """It should raise an error if it's not configured with any Selector.""" + + loader = ItemLoader() + + with pytest.raises(RuntimeError) as err: + loader.get_selector_values("field_name", [], None) + + +def test_get_selector_values(): + """Selectors must be properly called as well as correctly flatten the data. + + For this test, we're testing 'css', but it should also work the same for 'xpath'. + """ + + selector_rules = ["#rule1", "#rule2", "#rule3"] + field_name = "field" + parsed_data = ["data1", "data2"] + + mock_css_selector = mock.Mock() + mock_css_selector().getall.return_value = parsed_data + mock_css_selector.__name__ = "css" + + mock_selector = mock.Mock() + mock_selector.css = mock_css_selector + + loader = ItemLoader(selector=mock_selector) + loader.write_to_stats = mock.Mock() + + # This wasn't actually initialized so it will return 0 by default otherwise. + loader.field_tracker["field_css"] = 1 + + result = loader.get_selector_values(field_name, selector_rules, "css") + + assert result == flatten([parsed_data] * len(selector_rules)) + + mock_selector.assert_has_calls( + [ + mock.call.css(selector_rules[0]), + mock.call.css().getall(), + mock.call.css(selector_rules[1]), + mock.call.css().getall(), + mock.call.css(selector_rules[2]), + mock.call.css().getall(), + ] + ) + + loader.write_to_stats.assert_has_calls( + [ + mock.call(field_name, parsed_data, 1, "css", name=None), + mock.call(field_name, parsed_data, 2, "css", name=None), + mock.call(field_name, parsed_data, 3, "css", name=None), + ] + ) From d4d3671ea6c3e2f8e617e5c0cca52cc6710503b2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 14 May 2020 14:59:04 +0800 Subject: [PATCH 5/7] add tests to stat logging feature --- tests/test_loader_stats.py | 175 +++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 tests/test_loader_stats.py diff --git a/tests/test_loader_stats.py b/tests/test_loader_stats.py new file mode 100644 index 0000000..9325e5b --- /dev/null +++ b/tests/test_loader_stats.py @@ -0,0 +1,175 @@ +import pytest +from unittest import mock + +from itemloaders import ItemLoader +from parsel import Selector + + +def test_write_to_stats_with_uninjected_stat_dependency(): + """It should not call stats when the stat dependency isn't available.""" + + loader = ItemLoader() + loader.stats = mock.MagicMock() + loader.stats.__bool__.return_value = False # don't pass the if-condition + + assert loader.write_to_stats("field_name", "parsed_data", 0, "xpath") == None + assert not loader.stats.inc_value.called + + +def test_write_to_stats_with_no_parsed_data(): + """It should not call stats when parsing the data returned None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + parsed_data = None + expected_stat_key = "parser/ItemLoader/field_name/css/0/missing" + + assert loader.write_to_stats("field_name", parsed_data, 0, "css") == None + loader.stats.inc_value.assert_called_once_with(expected_stat_key) + + +def test_write_to_stats_with_no_field_name(): + """It should not call stats when the 'field_name' passed is None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + assert loader.write_to_stats(None, "sample data", 0, "css") == None + loader.stats.inc_value.assert_not_called() + + +def test_write_to_stats(): + """It should incremenent the correct key in the stat.""" + + loader = ItemLoader() + loader.stats = mock.MagicMock() + + expected_stat_key = "parser/ItemLoader/field_name/css/0" + + # Rules with values + assert loader.write_to_stats("field_name", "parsed_data", 123, "css") == None + + # Rules that hasn't rendered any values + assert loader.write_to_stats("field_name", None, 456, "css") == None + assert loader.write_to_stats("field_name", [], 789, "css") == None + + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/ItemLoader/field_name/css/123"), + mock.call("parser/ItemLoader/field_name/css/456/missing"), + mock.call("parser/ItemLoader/field_name/css/789/missing"), + ] + ) + + +TEST_HTML_BODY = """ + + This is a title + +
+

Product #1

+ $1.23 +
+ +
+
Product #2
+ $9.99 +
+ + +""" + + +class TestItemLoader(ItemLoader): + pass + + +@pytest.fixture() +def loader(): + mock_stats = mock.MagicMock() + selector = Selector(text=TEST_HTML_BODY) + loader = TestItemLoader(selector=selector, stats=mock_stats) + return loader + + +# NOTES: We'll be using the 'css' methods of ItemLoader below. The 'xpath' +# methods are also using the 'get_selector_values()' method underneath, the +# same with 'css'. So we'll assume that 'xpath' would also pass the test +# if 'css' passes. + +# This assumption will hold true for now, since the current implementation of +# the 'css' and 'xpath' methods are just facades to the 'get_selector_values()'. + + +def test_add_css_1(loader): + loader.add_css("title", "article h2::text") + loader.stats.inc_value.assert_has_calls( + [mock.call("parser/TestItemLoader/title/css/1")] + ) + assert loader.stats.inc_value.call_count == 1 + + +def test_add_css_2(loader): + loader.add_css("title", ["article h2::text", "article .product-title::text"]) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1"), + mock.call("parser/TestItemLoader/title/css/2"), + ] + ) + assert loader.stats.inc_value.call_count == 2 + + +def test_add_css_3_missing(loader): + loader.add_css("title", "h1::text") # The

doesn't exist at all. + loader.stats.inc_value.assert_has_calls( + [mock.call("parser/TestItemLoader/title/css/1/missing")] + ) + assert loader.stats.inc_value.call_count == 1 + + +def test_multiple_1(loader): + loader.add_css("title", "h2::text") + loader.add_css("title", ["article h2::text", "article .product-title::text"]) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1"), + mock.call("parser/TestItemLoader/title/css/2"), + mock.call("parser/TestItemLoader/title/css/3"), + ] + ) + assert loader.stats.inc_value.call_count == 3 + + +def test_multiple_1_with_name(loader): + loader.add_css("title", "h2::text", name="title from h2") + loader.add_css( + "title", + ["article h2::text", "article .product-title::text"], + name="title from article", + ) + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1/title from h2"), + mock.call("parser/TestItemLoader/title/css/2/title from article"), + mock.call("parser/TestItemLoader/title/css/3/title from article"), + ] + ) + assert loader.stats.inc_value.call_count == 3 + + +def test_multiple_2_with_name(loader): + loader.add_css("title", "h2::text", name="title from h2") + loader.add_xpath("title", "//article/h2/text()", name="title from article") + loader.add_css("title", "article .product-title::text") + loader.add_xpath("title", "//aside/h1/text()", name="title from aside") + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/TestItemLoader/title/css/1/title from h2"), + mock.call("parser/TestItemLoader/title/xpath/1/title from article"), + mock.call("parser/TestItemLoader/title/css/2"), + mock.call("parser/TestItemLoader/title/xpath/2/title from aside/missing"), + ] + ) + assert loader.stats.inc_value.call_count == 4 From 17139441f8199a2fdf96eb73c27c0619d634502c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 14 May 2020 16:09:25 +0800 Subject: [PATCH 6/7] remove docstring in write_to_stats() causing sphinx build to fail --- itemloaders/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index b85ebf0..98e6231 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -449,15 +449,10 @@ def write_to_stats( ): """Responsible for logging the parser rules usage. - NOTES: It's hard to easily denote which parser rule hasn't produced any - data for the entire crawl, since ItemLoaders essentially don't know - when the spider is going to be closed, as well as it has many - instantiations all throughout the code. - The implementation below where each missing parsed_data is being logged to the stat is clunky, but necessary. With this, we can only surmise that it's safe to remove parser fallback parser if it's all just - '*/missing' in the stats. + '.../missing' in the stats. """ if not self.stats or not field_name: From dd96b7e4466354b5d270cedd5dd66da313ccd0a8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 26 May 2020 11:41:29 +0800 Subject: [PATCH 7/7] rename field_tracker into field_position_tracker for more clarity --- itemloaders/__init__.py | 8 ++++---- tests/test_selector_loader.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index 98e6231..de51e98 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -105,7 +105,7 @@ def __init__(self, item=None, selector=None, parent=None, stats=None, **context) # This keeps track of the position of the 'field' name that is being # loaded for a more accurate logging in the stats. - self.field_tracker = defaultdict(int) + self.field_position_tracker = defaultdict(int) @property def _values(self): @@ -335,7 +335,7 @@ def add_xpath(self, field_name, xpath, *processors, **kw): loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ - self.field_tracker[f"{field_name}_xpath"] += 1 + self.field_position_tracker[f"{field_name}_xpath"] += 1 values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.add_value(field_name, values, *processors, **kw) @@ -388,7 +388,7 @@ def add_css(self, field_name, css, *processors, **kw): # HTML snippet:

the price is $1200

loader.add_css('price', 'p#price', re='the price is (.*)') """ - self.field_tracker[f"{field_name}_css"] += 1 + self.field_position_tracker[f"{field_name}_css"] += 1 values = self.get_selector_values(field_name, css, 'css', **kw) self.add_value(field_name, values, *processors, **kw) @@ -433,7 +433,7 @@ def get_selector_values(self, field_name, selector_rules, selector_type, **kw): # For every call of `add_css()` and `add_xpath()` this is incremented. # We'll use it as the base index of the position of the logged stats. - index = self.field_tracker[f"{field_name}_{selector_type}"] + index = self.field_position_tracker[f"{field_name}_{selector_type}"] values = [] for position, rule in enumerate(arg_to_iter(selector_rules), index): diff --git a/tests/test_selector_loader.py b/tests/test_selector_loader.py index 626e0f6..d0e6a0e 100644 --- a/tests/test_selector_loader.py +++ b/tests/test_selector_loader.py @@ -186,7 +186,7 @@ def test_get_selector_values(): loader.write_to_stats = mock.Mock() # This wasn't actually initialized so it will return 0 by default otherwise. - loader.field_tracker["field_css"] = 1 + loader.field_position_tracker["field_css"] = 1 result = loader.get_selector_values(field_name, selector_rules, "css")