Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scrapy-loader-upkeep migration #12

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 67 additions & 15 deletions itemloaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class ItemLoader:
default_input_processor = Identity()
default_output_processor = Identity()

def __init__(self, item=None, selector=None, parent=None, **context):
def __init__(self, item=None, selector=None, parent=None, stats=None, **context):
self.selector = selector
context.update(selector=selector)
if item is None:
Expand All @@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context):
for field_name, value in item.items():
self._values[field_name] += arg_to_iter(value)

# This is the new injected dependency that we'll be using as the main
# functionality of this tool.
self.stats = stats

# This keeps track of the position of the 'field' name that is being
# loaded for a more accurate logging in the stats.
self.field_position_tracker = defaultdict(int)

@property
def _values(self):
if self.parent is not None:
Expand Down Expand Up @@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw):
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

"""
values = self._get_xpathvalues(xpath, **kw)
self.field_position_tracker[f"{field_name}_xpath"] += 1
values = self.get_selector_values(field_name, xpath, 'xpath', **kw)
self.add_value(field_name, values, *processors, **kw)

def replace_xpath(self, field_name, xpath, *processors, **kw):
    """Like :meth:`add_xpath`, but replaces the collected data instead of
    appending to it.
    """
    # NOTE(review): unlike add_xpath(), this does not bump
    # field_position_tracker before extracting — presumably intentional
    # since replace_* overwrites earlier values; confirm.
    extracted = self.get_selector_values(field_name, xpath, 'xpath', **kw)
    self.replace_value(field_name, extracted, *processors, **kw)

def get_xpath(self, xpath, *processors, **kw):
Expand All @@ -358,14 +367,9 @@ def get_xpath(self, xpath, *processors, **kw):
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

"""
values = self._get_xpathvalues(xpath, **kw)
values = self.get_selector_values(None, xpath, 'xpath', **kw)
return self.get_value(values, *processors, **kw)

def _get_xpathvalues(self, xpaths, **kw):
self._check_selector_method()
xpaths = arg_to_iter(xpaths)
return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)

def add_css(self, field_name, css, *processors, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
Expand All @@ -384,14 +388,15 @@ def add_css(self, field_name, css, *processors, **kw):
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_css('price', 'p#price', re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
self.field_position_tracker[f"{field_name}_css"] += 1
values = self.get_selector_values(field_name, css, 'css', **kw)
self.add_value(field_name, values, *processors, **kw)

def replace_css(self, field_name, css, *processors, **kw):
    """Like :meth:`add_css`, but replaces the collected data instead of
    appending to it.
    """
    # NOTE(review): mirrors replace_xpath() — no position-tracker bump here,
    # so stat positions continue from the add_* call count; confirm intent.
    extracted = self.get_selector_values(field_name, css, 'css', **kw)
    self.replace_value(field_name, extracted, *processors, **kw)

def get_css(self, css, *processors, **kw):
Expand All @@ -414,10 +419,57 @@ def get_css(self, css, *processors, **kw):
# HTML snippet: <p id="price">the price is $1200</p>
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
values = self.get_selector_values(None, css, 'css', **kw)
return self.get_value(values, *processors, **kw)

def _get_cssvalues(self, csss, **kw):
def get_selector_values(self, field_name, selector_rules, selector_type, **kw):
    """Run one or more selector rules and log each rule's usage in the stats.

    :param field_name: item field the values are for; ``None`` for the
        ``get_*`` variants, which suppresses stats logging downstream.
    :param selector_rules: a single rule or an iterable of rules.
    :param selector_type: ``'css'`` or ``'xpath'`` — the attribute looked up
        on ``self.selector``.
    :returns: the flattened list of all values extracted by all rules.
    :raises RuntimeError: if no selector is configured
        (via :meth:`_check_selector_method`).
    """
    self._check_selector_method()

    # Resolve the bound extraction method (self.selector.css / .xpath).
    extract = getattr(self.selector, selector_type or '', None)

    # Optional human-readable label (e.g. name='title from h2') passed by
    # callers for extra context in the stat keys.
    name = kw.get("name")

    # add_css()/add_xpath() increment this counter once per call; it is the
    # starting position for the stat entries logged below.
    start = self.field_position_tracker[f"{field_name}_{selector_type}"]

    collected = []
    for position, rule in enumerate(arg_to_iter(selector_rules), start):
        data = extract(rule).getall()
        collected.append(data)
        self.write_to_stats(
            field_name, data, position, selector_type, name=name
        )
    return flatten(collected)

def write_to_stats(
    self, field_name, parsed_data, position, selector_type, name=None
):
    """Record the usage of a single parser rule in the stats object.

    Each invocation increments a key of the form
    ``parser/<loader>/<field>/<selector_type>/<position>[/<name>][/missing]``.
    Logging every miss individually is clunky but deliberate: a fallback
    rule whose entries are all ``.../missing`` is one we can safely remove.

    Does nothing when no stats object was injected or ``field_name`` is falsy.
    """
    if not self.stats or not field_name:
        return

    key = f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}"

    if name:
        key += f"/{name}"

    # getall() yields [] for no matches; None covers unparsed data.
    if parsed_data in (None, []):
        key += "/missing"

    self.stats.inc_value(key)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes itemloaders depend on scrapy.stats, at least as far as interface is concerned. Do you think we can avoid it? I can see 2 main ways:

  • ask for a different stats object here, maybe a plain dictionary; allow to pass real stats in Scrapy's itemloader
  • move stats handling to Scrapy's wrapper instead of this package

We can also start decoupling scrapy Stats from scrapy, but I'd rather not do this now.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @kmike , the itemloaders accept Scrapy's stats instance via DI and could work without it, as per this.

Accepting the actual Scrapy stats instance is crucial for the loader's usage to be logged at the end of a job (and be logged in Scrapy Cloud's stats-tab), like this:

2019-06-16 14:32:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{ ...
  'parser/QuotesItemLoader/author/css/1': 10,
  'parser/QuotesItemLoader/quote/css/1/missing': 10,
  'parser/QuotesItemLoader/quote/css/2': 10
  ...
}

I'm all good for making it like a defaultdict(int) to function like the stat's inc_value, as long as we can pass it out to Scrapy's logging system.

However, I might be missing something 🤔 — is there another way to avoid injecting Scrapy's stats into the itemloaders and still log the parser's usage?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess @kmike 's point here is the implicit dependency.
Even though scrapy would not become an install requirement, we would still be taking on a direct dependency on scrapy's stats interface.
So, maybe a solution would be to create some sort of adapter (another one).
Though, I somehow foresee a problem.

If we allow ItemLoader to be extended with upkeep, we'll need to make upkeep the default base class for scrapy, since scrapy users will keep using scrapy's ItemLoader, which is an extension of this one.
So, if they import this extension instead of scrapy's, they'll lose scrapy's behavior.


@property
def loader_name(self):
    """Concrete class name, used as the loader identifier in stat keys."""
    return type(self).__name__
Empty file added tests/__init__.py
Empty file.
175 changes: 175 additions & 0 deletions tests/test_loader_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import pytest
from unittest import mock

from itemloaders import ItemLoader
from parsel import Selector


def test_write_to_stats_with_uninjected_stat_dependency():
    """It should not call stats when the stat dependency isn't available."""

    loader = ItemLoader()
    loader.stats = mock.MagicMock()
    loader.stats.__bool__.return_value = False  # don't pass the if-condition

    # 'is None' (not '== None') per PEP 8; assert_not_called matches the
    # convention used by the other tests in this module.
    assert loader.write_to_stats("field_name", "parsed_data", 0, "xpath") is None
    loader.stats.inc_value.assert_not_called()


def test_write_to_stats_with_no_parsed_data():
    """It should log a '/missing' stat entry when parsing returned no data."""
    # (The old docstring claimed stats is NOT called, but the assertion below
    # shows it is called — with the '/missing' suffix appended to the key.)

    loader = ItemLoader()
    loader.stats = mock.Mock()

    parsed_data = None
    expected_stat_key = "parser/ItemLoader/field_name/css/0/missing"

    assert loader.write_to_stats("field_name", parsed_data, 0, "css") is None
    loader.stats.inc_value.assert_called_once_with(expected_stat_key)


def test_write_to_stats_with_no_field_name():
    """It should not call stats when the 'field_name' passed is None."""

    loader = ItemLoader()
    loader.stats = mock.Mock()

    # 'is None' rather than '== None' per PEP 8 (E711).
    assert loader.write_to_stats(None, "sample data", 0, "css") is None
    loader.stats.inc_value.assert_not_called()


def test_write_to_stats():
    """It should increment the correct key in the stats."""
    # Fixes: typo in the docstring ('incremenent'), an unused local
    # ('expected_stat_key' was never referenced), and '== None' comparisons.

    loader = ItemLoader()
    loader.stats = mock.MagicMock()

    # Rules with values
    assert loader.write_to_stats("field_name", "parsed_data", 123, "css") is None

    # Rules that haven't rendered any values
    assert loader.write_to_stats("field_name", None, 456, "css") is None
    assert loader.write_to_stats("field_name", [], 789, "css") is None

    loader.stats.inc_value.assert_has_calls(
        [
            mock.call("parser/ItemLoader/field_name/css/123"),
            mock.call("parser/ItemLoader/field_name/css/456/missing"),
            mock.call("parser/ItemLoader/field_name/css/789/missing"),
        ]
    )


# Fixture page for the loader tests: two <article> products — both have a
# '.price', only the second has a '.product-title', and there is no <h1> or
# <aside> anywhere, so selectors targeting those intentionally match nothing.
TEST_HTML_BODY = """
<html>
<title>This is a title</title>
<body>
<article>
<h2>Product #1</h2>
<span class='price'>$1.23</span>
</article>

<article>
<div class='product-title'>Product #2</div>
<span class='price'>$9.99</span>
</article>
</body>
</html>
"""


class TestItemLoader(ItemLoader):
    """Plain subclass so stat keys carry a loader name other than 'ItemLoader'."""
    pass


@pytest.fixture()
def loader():
    """A TestItemLoader wired to the fixture page and a mocked stats object."""
    return TestItemLoader(
        selector=Selector(text=TEST_HTML_BODY),
        stats=mock.MagicMock(),
    )


# NOTES: We'll be using the 'css' methods of ItemLoader below. The 'xpath'
# methods are also using the 'get_selector_values()' method underneath, the
# same with 'css'. So we'll assume that 'xpath' would also pass the test
# if 'css' passes.

# This assumption will hold true for now, since the current implementation of
# the 'css' and 'xpath' methods are just facades to the 'get_selector_values()'.


def test_add_css_1(loader):
    """A single matching rule logs exactly one hit at position 1."""
    loader.add_css("title", "article h2::text")
    loader.stats.inc_value.assert_called_once_with(
        "parser/TestItemLoader/title/css/1"
    )


def test_add_css_2(loader):
    """Multiple rules in one call log consecutive positions."""
    loader.add_css("title", ["article h2::text", "article .product-title::text"])

    expected = [
        mock.call("parser/TestItemLoader/title/css/1"),
        mock.call("parser/TestItemLoader/title/css/2"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_add_css_3_missing(loader):
    """A rule that matches nothing is logged with a '/missing' suffix."""
    loader.add_css("title", "h1::text")  # The <h1> doesn't exist at all.
    loader.stats.inc_value.assert_called_once_with(
        "parser/TestItemLoader/title/css/1/missing"
    )


def test_multiple_1(loader):
    """Positions accumulate across successive add_css() calls on a field."""
    loader.add_css("title", "h2::text")
    loader.add_css("title", ["article h2::text", "article .product-title::text"])

    expected = [
        mock.call("parser/TestItemLoader/title/css/1"),
        mock.call("parser/TestItemLoader/title/css/2"),
        mock.call("parser/TestItemLoader/title/css/3"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_multiple_1_with_name(loader):
    """The optional name= label is appended to each rule's stat key."""
    loader.add_css("title", "h2::text", name="title from h2")
    loader.add_css(
        "title",
        ["article h2::text", "article .product-title::text"],
        name="title from article",
    )

    expected = [
        mock.call("parser/TestItemLoader/title/css/1/title from h2"),
        mock.call("parser/TestItemLoader/title/css/2/title from article"),
        mock.call("parser/TestItemLoader/title/css/3/title from article"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)


def test_multiple_2_with_name(loader):
    """Interleaved css/xpath calls keep independent position counters."""
    loader.add_css("title", "h2::text", name="title from h2")
    loader.add_xpath("title", "//article/h2/text()", name="title from article")
    loader.add_css("title", "article .product-title::text")
    loader.add_xpath("title", "//aside/h1/text()", name="title from aside")

    expected = [
        mock.call("parser/TestItemLoader/title/css/1/title from h2"),
        mock.call("parser/TestItemLoader/title/xpath/1/title from article"),
        mock.call("parser/TestItemLoader/title/css/2"),
        mock.call("parser/TestItemLoader/title/xpath/2/title from aside/missing"),
    ]
    loader.stats.inc_value.assert_has_calls(expected)
    assert loader.stats.inc_value.call_count == len(expected)
59 changes: 59 additions & 0 deletions tests/test_selector_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import unittest
import pytest
from unittest import mock

from parsel import Selector
from parsel.utils import flatten

from itemloaders import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst
Expand Down Expand Up @@ -151,3 +154,59 @@ def test_replace_css_re(self):
self.assertEqual(loader.get_output_value('url'), ['http://www.scrapy.org'])
loader.replace_css('url', 'a::attr(href)', re=r'http://www\.(.+)')
self.assertEqual(loader.get_output_value('url'), ['scrapy.org'])


def test_get_selector_values_with_no_selector():
    """It should raise an error if it's not configured with any Selector."""

    loader = ItemLoader()

    # The 'as err' binding was unused; asserting the raise alone is enough.
    with pytest.raises(RuntimeError):
        loader.get_selector_values("field_name", [], None)


def test_get_selector_values():
    """Selectors must be properly called as well as correctly flatten the data.

    For this test, we're testing 'css', but it should also work the same for 'xpath'.
    """

    selector_rules = ["#rule1", "#rule2", "#rule3"]
    field_name = "field"
    parsed_data = ["data1", "data2"]

    # NOTE: calling mock_css_selector() here both configures getall() and
    # registers one extra call on the mock; assert_has_calls below tolerates
    # that because it only checks for a matching subsequence of calls.
    mock_css_selector = mock.Mock()
    mock_css_selector().getall.return_value = parsed_data
    mock_css_selector.__name__ = "css"

    # Stand-in for a parsel Selector exposing only the .css attribute,
    # which is all get_selector_values() looks up for selector_type='css'.
    mock_selector = mock.Mock()
    mock_selector.css = mock_css_selector

    loader = ItemLoader(selector=mock_selector)
    loader.write_to_stats = mock.Mock()

    # This wasn't actually initialized so it will return 0 by default otherwise.
    loader.field_position_tracker["field_css"] = 1

    result = loader.get_selector_values(field_name, selector_rules, "css")

    # Each of the 3 rules yields parsed_data; the result is the flat union.
    assert result == flatten([parsed_data] * len(selector_rules))

    # One css(rule) + getall() pair per rule, in order.
    mock_selector.assert_has_calls(
        [
            mock.call.css(selector_rules[0]),
            mock.call.css().getall(),
            mock.call.css(selector_rules[1]),
            mock.call.css().getall(),
            mock.call.css(selector_rules[2]),
            mock.call.css().getall(),
        ]
    )

    # Positions start at the pre-seeded tracker value (1) and increment.
    loader.write_to_stats.assert_has_calls(
        [
            mock.call(field_name, parsed_data, 1, "css", name=None),
            mock.call(field_name, parsed_data, 2, "css", name=None),
            mock.call(field_name, parsed_data, 3, "css", name=None),
        ]
    )
)