Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scrapy-loader-upkeep migration #12

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 67 additions & 15 deletions itemloaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class ItemLoader:
default_input_processor = Identity()
default_output_processor = Identity()

def __init__(self, item=None, selector=None, parent=None, **context):
def __init__(self, item=None, selector=None, parent=None, stats=None, **context):
self.selector = selector
context.update(selector=selector)
if item is None:
Expand All @@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context):
for field_name, value in item.items():
self._values[field_name] += arg_to_iter(value)

# This is the new injected dependency that we'll be using as the main
# functionality of this tool.
self.stats = stats

# This keeps track of the position of the 'field' name that is being
# loaded for a more accurate logging in the stats.
self.field_position_tracker = defaultdict(int)

@property
def _values(self):
if self.parent is not None:
Expand Down Expand Up @@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw):
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

"""
values = self._get_xpathvalues(xpath, **kw)
self.field_position_tracker[f"{field_name}_xpath"] += 1
values = self.get_selector_values(field_name, xpath, 'xpath', **kw)
self.add_value(field_name, values, *processors, **kw)

def replace_xpath(self, field_name, xpath, *processors, **kw):
    """Like :meth:`add_xpath`, but replaces the collected data instead of
    appending to it.
    """
    # NOTE(review): unlike add_xpath(), this does not bump
    # field_position_tracker before extracting — presumably intentional
    # since replace_* overwrites earlier values; confirm.
    extracted = self.get_selector_values(field_name, xpath, 'xpath', **kw)
    self.replace_value(field_name, extracted, *processors, **kw)

def get_xpath(self, xpath, *processors, **kw):
Expand All @@ -358,14 +367,9 @@ def get_xpath(self, xpath, *processors, **kw):
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

"""
values = self._get_xpathvalues(xpath, **kw)
values = self.get_selector_values(None, xpath, 'xpath', **kw)
return self.get_value(values, *processors, **kw)

def _get_xpathvalues(self, xpaths, **kw):
self._check_selector_method()
xpaths = arg_to_iter(xpaths)
return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)

def add_css(self, field_name, css, *processors, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
Expand All @@ -384,14 +388,15 @@ def add_css(self, field_name, css, *processors, **kw):
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_css('price', 'p#price', re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
self.field_position_tracker[f"{field_name}_css"] += 1
values = self.get_selector_values(field_name, css, 'css', **kw)
self.add_value(field_name, values, *processors, **kw)

def replace_css(self, field_name, css, *processors, **kw):
    """Like :meth:`add_css`, but replaces the collected data instead of
    appending to it.
    """
    # NOTE(review): mirrors replace_xpath() — no position-tracker bump here,
    # so stat positions continue from the add_* call count; confirm intent.
    extracted = self.get_selector_values(field_name, css, 'css', **kw)
    self.replace_value(field_name, extracted, *processors, **kw)

def get_css(self, css, *processors, **kw):
Expand All @@ -414,10 +419,57 @@ def get_css(self, css, *processors, **kw):
# HTML snippet: <p id="price">the price is $1200</p>
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
values = self.get_selector_values(None, css, 'css', **kw)
return self.get_value(values, *processors, **kw)

def _get_cssvalues(self, csss, **kw):
def get_selector_values(self, field_name, selector_rules, selector_type, **kw):
    """Run one or more selector rules and log each rule's usage in the stats.

    :param field_name: item field the values are for; ``None`` for the
        ``get_*`` variants, which suppresses stats logging downstream.
    :param selector_rules: a single rule or an iterable of rules.
    :param selector_type: ``'css'`` or ``'xpath'`` — the attribute looked up
        on ``self.selector``.
    :returns: the flattened list of all values extracted by all rules.
    :raises RuntimeError: if no selector is configured
        (via :meth:`_check_selector_method`).
    """
    self._check_selector_method()

    # Resolve the bound extraction method (self.selector.css / .xpath).
    extract = getattr(self.selector, selector_type or '', None)

    # Optional human-readable label (e.g. name='title from h2') passed by
    # callers for extra context in the stat keys.
    name = kw.get("name")

    # add_css()/add_xpath() increment this counter once per call; it is the
    # starting position for the stat entries logged below.
    start = self.field_position_tracker[f"{field_name}_{selector_type}"]

    collected = []
    for position, rule in enumerate(arg_to_iter(selector_rules), start):
        data = extract(rule).getall()
        collected.append(data)
        self.write_to_stats(
            field_name, data, position, selector_type, name=name
        )
    return flatten(collected)

def write_to_stats(
    self, field_name, parsed_data, position, selector_type, name=None
):
    """Record the usage of a single parser rule in the stats object.

    Each invocation increments a key of the form
    ``parser/<loader>/<field>/<selector_type>/<position>[/<name>][/missing]``.
    Logging every miss individually is clunky but deliberate: a fallback
    rule whose entries are all ``.../missing`` is one we can safely remove.

    Does nothing when no stats object was injected or ``field_name`` is falsy.
    """
    if not self.stats or not field_name:
        return

    key = f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}"

    if name:
        key += f"/{name}"

    # getall() yields [] for no matches; None covers unparsed data.
    if parsed_data in (None, []):
        key += "/missing"

    self.stats.inc_value(key)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes itemloaders depend on scrapy.stats, at least as far as interface is concerned. Do you think we can avoid it? I can see 2 main ways:

  • ask for a different stats object here, maybe a plain dictionary; allow to pass real stats in Scrapy's itemloader
  • move stats handling to Scrapy's wrapper instead of this package

We can also start decoupling scrapy Stats from scrapy, but I'd rather not do this now.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @kmike , the itemloaders accept Scrapy's stats instance via DI and could work without it, as per this.

Accepting the actual Scrapy stats instance is crucial for the loader's usage to be logged at the end of a job (and be logged in Scrapy Cloud's stats-tab), like this:

2019-06-16 14:32:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{ ...
  'parser/QuotesItemLoader/author/css/1': 10,
  'parser/QuotesItemLoader/quote/css/1/missing': 10,
  'parser/QuotesItemLoader/quote/css/2': 10
  ...
}

I'm all good for making it like a defaultdict(int) to function like the stat's inc_value, as long as we can pass it out to Scrapy's logging system.

However, I might be missing something 🤔 — is there another way to avoid injecting Scrapy's stats into the itemloaders and still log the parser's usage?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess @kmike 's point here is the implicit dependency.
Even though scrapy would not become an install requirement, we would still be taking on a direct dependency on scrapy's stats interface.
So, maybe a solution would be to create some sort of adapter (another one).
Though, I somehow foresee a problem.

If we allow ItemLoader to be extended with upkeep, we'll need to make upkeep the default base class for scrapy, since scrapy users will keep using scrapy's ItemLoader, which is an extension of this one.
So, if they import this extension instead of scrapy's, they'll lose scrapy's behavior.


@property
def loader_name(self):
    """Concrete class name, used as the loader identifier in stat keys."""
    return type(self).__name__
Empty file added tests/__init__.py
Empty file.
175 changes: 175 additions & 0 deletions tests/test_loader_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import pytest
from unittest import mock

from itemloaders import ItemLoader
from parsel import Selector


def test_write_to_stats_with_uninjected_stat_dependency():
    """It should not call stats when the stat dependency isn't available."""

    loader = ItemLoader()
    loader.stats = mock.MagicMock()
    loader.stats.__bool__.return_value = False  # don't pass the if-condition

    # 'is None' (not '== None') per PEP 8; assert_not_called matches the
    # convention used by the other tests in this module.
    assert loader.write_to_stats("field_name", "parsed_data", 0, "xpath") is None
    loader.stats.inc_value.assert_not_called()


def test_write_to_stats_with_no_parsed_data():
    """It should log a '/missing' stat entry when parsing returned no data."""
    # (The old docstring claimed stats is NOT called, but the assertion below
    # shows it is called — with the '/missing' suffix appended to the key.)

    loader = ItemLoader()
    loader.stats = mock.Mock()

    parsed_data = None
    expected_stat_key = "parser/ItemLoader/field_name/css/0/missing"

    assert loader.write_to_stats("field_name", parsed_data, 0, "css") is None
    loader.stats.inc_value.assert_called_once_with(expected_stat_key)


def test_write_to_stats_with_no_field_name():
    """It should not call stats when the 'field_name' passed is None."""

    loader = ItemLoader()
    loader.stats = mock.Mock()

    # 'is None' rather than '== None' per PEP 8 (E711).
    assert loader.write_to_stats(None, "sample data", 0, "css") is None
    loader.stats.inc_value.assert_not_called()


def test_write_to_stats():
    """It should increment the correct key in the stats."""
    # Fixes: typo in the docstring ('incremenent'), an unused local
    # ('expected_stat_key' was never referenced), and '== None' comparisons.

    loader = ItemLoader()
    loader.stats = mock.MagicMock()

    # Rules with values
    assert loader.write_to_stats("field_name", "parsed_data", 123, "css") is None

    # Rules that haven't rendered any values
    assert loader.write_to_stats("field_name", None, 456, "css") is None
    assert loader.write_to_stats("field_name", [], 789, "css") is None

    loader.stats.inc_value.assert_has_calls(
        [
            mock.call("parser/ItemLoader/field_name/css/123"),
            mock.call("parser/ItemLoader/field_name/css/456/missing"),
            mock.call("parser/ItemLoader/field_name/css/789/missing"),
        ]
    )


# Fixture page for the loader tests: two <article> products — both have a
# '.price', only the second has a '.product-title', and there is no <h1> or
# <aside> anywhere, so selectors targeting those intentionally match nothing.
TEST_HTML_BODY = """
<html>
<title>This is a title</title>
<body>
<article>
<h2>Product #1</h2>
<span class='price'>$1.23</span>
</article>

<article>
<div class='product-title'>Product #2</div>
<span class='price'>$9.99</span>
</article>
</body>
</html>
"""


class TestItemLoader(ItemLoader):
    """Plain subclass so stat keys carry a loader name other than 'ItemLoader'."""
    pass


@pytest.fixture()
def loader():
    """A TestItemLoader wired to the fixture page and a mocked stats object."""
    return TestItemLoader(
        selector=Selector(text=TEST_HTML_BODY),
        stats=mock.MagicMock(),
    )


# NOTES: We'll be using the 'css' methods of ItemLoader below. The 'xpath'
# methods are also using the 'get_selector_values()' method underneath, the
# same with 'css'. So we'll assume that 'xpath' would also pass the test
# if 'css' passes.

# This assumption will hold true for now, since the current implementation of
# the 'css' and 'xpath' methods are just facades to the 'get_selector_values()'.


def test_add_css_1(loader):
    """A single matching rule logs exactly one hit at position 1."""
    loader.add_css("title", "article h2::text")
    loader.stats.inc_value.assert_called_once_with(
        "parser/TestItemLoader/title/css/1"
    )


def test_add_css_2(loader):
    """Multiple rules in one call log consecutive positions."""
    loader.add_css("title", ["article h2::text", "article .product-title::text"])

    expected = [
        mock.call("parser/TestItemLoader/title/css/1"),
        mock.call("parser/TestItemLoader/title/css/2"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_add_css_3_missing(loader):
    """A rule that matches nothing is logged with a '/missing' suffix."""
    loader.add_css("title", "h1::text")  # The <h1> doesn't exist at all.
    loader.stats.inc_value.assert_called_once_with(
        "parser/TestItemLoader/title/css/1/missing"
    )


def test_multiple_1(loader):
    """Positions accumulate across successive add_css() calls on a field."""
    loader.add_css("title", "h2::text")
    loader.add_css("title", ["article h2::text", "article .product-title::text"])

    expected = [
        mock.call("parser/TestItemLoader/title/css/1"),
        mock.call("parser/TestItemLoader/title/css/2"),
        mock.call("parser/TestItemLoader/title/css/3"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_multiple_1_with_name(loader):
    """The optional name= label is appended to each rule's stat key."""
    loader.add_css("title", "h2::text", name="title from h2")
    loader.add_css(
        "title",
        ["article h2::text", "article .product-title::text"],
        name="title from article",
    )

    expected = [
        mock.call("parser/TestItemLoader/title/css/1/title from h2"),
        mock.call("parser/TestItemLoader/title/css/2/title from article"),
        mock.call("parser/TestItemLoader/title/css/3/title from article"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_multiple_2_with_name(loader):
    """Interleaved css/xpath calls keep independent position counters."""
    loader.add_css("title", "h2::text", name="title from h2")
    loader.add_xpath("title", "//article/h2/text()", name="title from article")
    loader.add_css("title", "article .product-title::text")
    loader.add_xpath("title", "//aside/h1/text()", name="title from aside")

    expected = [
        mock.call("parser/TestItemLoader/title/css/1/title from h2"),
        mock.call("parser/TestItemLoader/title/xpath/1/title from article"),
        mock.call("parser/TestItemLoader/title/css/2"),
        mock.call("parser/TestItemLoader/title/xpath/2/title from aside/missing"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)
59 changes: 59 additions & 0 deletions tests/test_selector_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import unittest
import pytest
from unittest import mock

from parsel import Selector
from parsel.utils import flatten

from itemloaders import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst
Expand Down Expand Up @@ -151,3 +154,59 @@ def test_replace_css_re(self):
self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
loader.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)')
self.assertEqual(loader.get_output_value('url'), ['scrapy.org'])


def test_get_selector_values_with_no_selector():
    """It should raise an error if it's not configured with any Selector."""

    loader = ItemLoader()

    # The 'as err' binding was unused; asserting the raise alone is enough.
    with pytest.raises(RuntimeError):
        loader.get_selector_values("field_name", [], None)


def test_get_selector_values():
    """Selectors must be properly called as well as correctly flatten the data.

    For this test, we're testing 'css', but it should also work the same for 'xpath'.
    """

    selector_rules = ["#rule1", "#rule2", "#rule3"]
    field_name = "field"
    parsed_data = ["data1", "data2"]

    # NOTE: calling mock_css_selector() here both configures getall() and
    # registers one extra call on the mock; assert_has_calls below tolerates
    # that because it only checks for a matching subsequence of calls.
    mock_css_selector = mock.Mock()
    mock_css_selector().getall.return_value = parsed_data
    mock_css_selector.__name__ = "css"

    # Stand-in for a parsel Selector exposing only the .css attribute,
    # which is all get_selector_values() looks up for selector_type='css'.
    mock_selector = mock.Mock()
    mock_selector.css = mock_css_selector

    loader = ItemLoader(selector=mock_selector)
    loader.write_to_stats = mock.Mock()

    # This wasn't actually initialized so it will return 0 by default otherwise.
    loader.field_position_tracker["field_css"] = 1

    result = loader.get_selector_values(field_name, selector_rules, "css")

    # Each of the 3 rules yields parsed_data; the result is the flat union.
    assert result == flatten([parsed_data] * len(selector_rules))

    # One css(rule) + getall() pair per rule, in order.
    mock_selector.assert_has_calls(
        [
            mock.call.css(selector_rules[0]),
            mock.call.css().getall(),
            mock.call.css(selector_rules[1]),
            mock.call.css().getall(),
            mock.call.css(selector_rules[2]),
            mock.call.css().getall(),
        ]
    )

    # Positions start at the pre-seeded tracker value (1) and increment.
    loader.write_to_stats.assert_has_calls(
        [
            mock.call(field_name, parsed_data, 1, "css", name=None),
            mock.call(field_name, parsed_data, 2, "css", name=None),
            mock.call(field_name, parsed_data, 3, "css", name=None),
        ]
    )
)