diff --git a/itemloaders/__init__.py b/itemloaders/__init__.py index acaae48..de51e98 100644 --- a/itemloaders/__init__.py +++ b/itemloaders/__init__.py @@ -86,7 +86,7 @@ class ItemLoader: default_input_processor = Identity() default_output_processor = Identity() - def __init__(self, item=None, selector=None, parent=None, **context): + def __init__(self, item=None, selector=None, parent=None, stats=None, **context): self.selector = selector context.update(selector=selector) if item is None: @@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context): for field_name, value in item.items(): self._values[field_name] += arg_to_iter(value) + # This is the new injected dependency that we'll be using as the main + # functionality of this tool. + self.stats = stats + + # This keeps track of the position of the 'field' name that is being + # loaded for a more accurate logging in the stats. + self.field_position_tracker = defaultdict(int) + @property def _values(self): if self.parent is not None: @@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw): loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + self.field_position_tracker[f"{field_name}_xpath"] += 1 + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.add_value(field_name, values, *processors, **kw) def replace_xpath(self, field_name, xpath, *processors, **kw): """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. 
""" - values = self._get_xpathvalues(xpath, **kw) + values = self.get_selector_values(field_name, xpath, 'xpath', **kw) self.replace_value(field_name, values, *processors, **kw) def get_xpath(self, xpath, *processors, **kw): @@ -358,14 +367,9 @@ def get_xpath(self, xpath, *processors, **kw): loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ - values = self._get_xpathvalues(xpath, **kw) + values = self.get_selector_values(None, xpath, 'xpath', **kw) return self.get_value(values, *processors, **kw) - def _get_xpathvalues(self, xpaths, **kw): - self._check_selector_method() - xpaths = arg_to_iter(xpaths) - return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths) - def add_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`ItemLoader.add_value` but receives a CSS selector @@ -384,14 +388,15 @@ def add_css(self, field_name, css, *processors, **kw): # HTML snippet:
the price is $1200
loader.add_css('price', 'p#price', re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + self.field_position_tracker[f"{field_name}_css"] += 1 + values = self.get_selector_values(field_name, css, 'css', **kw) self.add_value(field_name, values, *processors, **kw) def replace_css(self, field_name, css, *processors, **kw): """ Similar to :meth:`add_css` but replaces collected data instead of adding it. """ - values = self._get_cssvalues(css, **kw) + values = self.get_selector_values(field_name, css, 'css', **kw) self.replace_value(field_name, values, *processors, **kw) def get_css(self, css, *processors, **kw): @@ -414,10 +419,57 @@ def get_css(self, css, *processors, **kw): # HTML snippet:the price is $1200
loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ - values = self._get_cssvalues(css, **kw) + values = self.get_selector_values(None, css, 'css', **kw) return self.get_value(values, *processors, **kw) - def _get_cssvalues(self, csss, **kw): + def get_selector_values(self, field_name, selector_rules, selector_type, **kw): + self._check_selector_method() - csss = arg_to_iter(csss) - return flatten(self.selector.css(css).getall() for css in csss) + + selector = getattr(self.selector, selector_type or '', None) + + # The optional arg in methods like `add_css()` for context in stats + name = kw.get("name") + + # For every call of `add_css()` and `add_xpath()` this is incremented. + # We'll use it as the base index of the position of the logged stats. + index = self.field_position_tracker[f"{field_name}_{selector_type}"] + + values = [] + for position, rule in enumerate(arg_to_iter(selector_rules), index): + parsed_data = selector(rule).getall() + values.append(parsed_data) + self.write_to_stats( + field_name, parsed_data, position, selector_type, name=name + ) + return flatten(values) + + def write_to_stats( + self, field_name, parsed_data, position, selector_type, name=None + ): + """Responsible for logging the parser rules usage. + + The implementation below where each missing parsed_data is being logged + to the stat is clunky, but necessary. With this, we can only surmise + that it's safe to remove a fallback parser if it's all just + '.../missing' in the stats. 
+ """ + + if not self.stats or not field_name: + return + + parser_label = ( + f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}" + ) + + if name: + parser_label += f"/{name}" + + if parsed_data in (None, []): + parser_label += "/missing" + + self.stats.inc_value(parser_label) + + @property + def loader_name(self): + return self.__class__.__name__ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_loader_stats.py b/tests/test_loader_stats.py new file mode 100644 index 0000000..9325e5b --- /dev/null +++ b/tests/test_loader_stats.py @@ -0,0 +1,175 @@ +import pytest +from unittest import mock + +from itemloaders import ItemLoader +from parsel import Selector + + +def test_write_to_stats_with_uninjected_stat_dependency(): + """It should not call stats when the stat dependency isn't available.""" + + loader = ItemLoader() + loader.stats = mock.MagicMock() + loader.stats.__bool__.return_value = False # don't pass the if-condition + + assert loader.write_to_stats("field_name", "parsed_data", 0, "xpath") == None + assert not loader.stats.inc_value.called + + +def test_write_to_stats_with_no_parsed_data(): + """It should increment the '/missing' stat key when parsing the data returned None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + parsed_data = None + expected_stat_key = "parser/ItemLoader/field_name/css/0/missing" + + assert loader.write_to_stats("field_name", parsed_data, 0, "css") == None + loader.stats.inc_value.assert_called_once_with(expected_stat_key) + + +def test_write_to_stats_with_no_field_name(): + """It should not call stats when the 'field_name' passed is None.""" + + loader = ItemLoader() + loader.stats = mock.Mock() + + assert loader.write_to_stats(None, "sample data", 0, "css") == None + loader.stats.inc_value.assert_not_called() + + +def test_write_to_stats(): + """It should increment the correct key in the stat.""" + + loader = ItemLoader() + loader.stats = 
mock.MagicMock() + + expected_stat_key = "parser/ItemLoader/field_name/css/0" + + # Rules with values + assert loader.write_to_stats("field_name", "parsed_data", 123, "css") == None + + # Rules that haven't rendered any values + assert loader.write_to_stats("field_name", None, 456, "css") == None + assert loader.write_to_stats("field_name", [], 789, "css") == None + + loader.stats.inc_value.assert_has_calls( + [ + mock.call("parser/ItemLoader/field_name/css/123"), + mock.call("parser/ItemLoader/field_name/css/456/missing"), + mock.call("parser/ItemLoader/field_name/css/789/missing"), + ] + ) + + +TEST_HTML_BODY = """ + +