From 091f3a56f82ff377dfa4b3f7b5f1169c6e165949 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 02:15:43 -0400 Subject: [PATCH 1/6] chore(refactor): Move sorted_versions function into caller --- docs/news.rst | 11 ++++ scrapyd/eggstorage.py | 9 ++- scrapyd/utils.py | 8 --- tests/test_eggstorage.py | 14 +++- tests/test_utils.py | 137 --------------------------------------- 5 files changed, 32 insertions(+), 147 deletions(-) delete mode 100644 tests/test_utils.py diff --git a/docs/news.rst b/docs/news.rst index b4b5ee09..77702c43 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -57,6 +57,17 @@ CLI - Remove all ``twistd`` subcommands (FTP servers, etc.). Run ``twistd``, if needed. - Run the ``scrapyd.__main__`` module, instead of the ``scrapyd.scripts.scrapyd_run`` module. +Utils +^^^^^ + +Move functions from ``scrapyd.utils`` into their callers: + +- ``sorted_versions`` to ``scrapyd.eggstorage`` +- ``get_crawl_args`` to ``scrapyd.launcher`` +- ``JsonResource``, ``get_spider_list`` and ``UtilsCache`` to ``scrapyd.webservice`` + +Move ``activate_egg`` from ``scrapyd.eggutils`` to ``scrapyd.runner`` + Fixed ~~~~~ diff --git a/scrapyd/eggstorage.py b/scrapyd/eggstorage.py index ad1aceeb..eb735bb6 100644 --- a/scrapyd/eggstorage.py +++ b/scrapyd/eggstorage.py @@ -3,11 +3,18 @@ import shutil from glob import escape, glob +from packaging.version import InvalidVersion, Version from zope.interface import implementer from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError from scrapyd.interfaces import IEggStorage -from scrapyd.utils import sorted_versions + + +def sorted_versions(versions): + try: + return sorted(versions, key=Version) + except InvalidVersion: + return sorted(versions) @implementer(IEggStorage) diff --git a/scrapyd/utils.py b/scrapyd/utils.py index df1dbe26..aaa6219f 100644 --- a/scrapyd/utils.py +++ b/scrapyd/utils.py @@ -5,7 +5,6 @@ from typing import ClassVar from urllib.parse import urlsplit -from packaging.version import InvalidVersion, Version from scrapy.utils.misc import load_object from twisted.web import resource @@ -174,10 +173,3 @@ def _to_native_str(text, encoding="utf-8", errors="strict"): if isinstance(text, str): return text return text.decode(encoding, errors) - - -def sorted_versions(versions): - try: - return sorted(versions, key=Version) - except InvalidVersion: - return sorted(versions) diff --git a/tests/test_eggstorage.py b/tests/test_eggstorage.py index d231b3c5..f5318e00 100644 --- a/tests/test_eggstorage.py +++ b/tests/test_eggstorage.py @@ -8,11 +8,23 @@ from scrapyd.app import application from scrapyd.config import Config -from scrapyd.eggstorage import FilesystemEggStorage +from scrapyd.eggstorage import FilesystemEggStorage, sorted_versions from scrapyd.exceptions import DirectoryTraversalError from scrapyd.interfaces import IEggStorage +@pytest.mark.parametrize( + ("versions", "expected"), + [ + (["zzz", "b", "ddd", "a", "x"], ["a", "b", "ddd", "x", "zzz"]), + (["10", "1", "9"], ["1", "9", "10"]), + (["2.11", "2.01", "2.9"], ["2.01", "2.9", "2.11"]), + ], +) +def test_sorted_versions(versions, expected): + assert sorted_versions(versions) == expected + + @implementer(IEggStorage) class SomeFakeEggStorage: def __init__(self, config): diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 634c1548..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,137 +0,0 @@ -import os -from io import BytesIO -from pkgutil import get_data -from subprocess import Popen -from unittest import mock - -import pytest -from scrapy.utils.test import get_pythonpath -from twisted.trial import unittest - -from scrapyd import get_application -from scrapyd.exceptions import RunnerError -from scrapyd.interfaces import IEggStorage -from scrapyd.utils import UtilsCache, get_crawl_args, get_spider_list, sorted_versions - - -def get_pythonpath_scrapyd(): - scrapyd_path = __import__("scrapyd").__path__[0] - return os.path.join(os.path.dirname(scrapyd_path), get_pythonpath(), os.environ.get("PYTHONPATH", "")) - - -class UtilsTest(unittest.TestCase): - def test_get_crawl_args(self): - msg = {"_project": "lolo", "_spider": "lala"} - - self.assertEqual(get_crawl_args(msg), ["lala"]) - - msg = {"_project": "lolo", "_spider": "lala", "arg1": "val1"} - cargs = get_crawl_args(msg) - - self.assertEqual(cargs, ["lala", "-a", "arg1=val1"]) - self.assertTrue(all(isinstance(x, str) for x in cargs), cargs) - - def test_get_crawl_args_with_settings(self): - msg = {"_project": "lolo", "_spider": "lala", "arg1": "val1", "settings": {"ONE": "two"}} - cargs = get_crawl_args(msg) - - self.assertEqual(cargs, ["lala", "-a", "arg1=val1", "-s", "ONE=two"]) - self.assertTrue(all(isinstance(x, str) for x in cargs), cargs) - - -class GetSpiderListTest(unittest.TestCase): - def setUp(self): - path = os.path.abspath(self.mktemp()) - j = os.path.join - eggs_dir = j(path, "eggs") - os.makedirs(eggs_dir) - dbs_dir = j(path, "dbs") - os.makedirs(dbs_dir) - logs_dir = j(path, "logs") - os.makedirs(logs_dir) - os.chdir(path) - with open("scrapyd.conf", "w") as f: - f.write("[scrapyd]\n") - f.write(f"eggs_dir = {eggs_dir}\n") - f.write(f"dbs_dir = {dbs_dir}\n") - f.write(f"logs_dir = {logs_dir}\n") - self.app = get_application() - - def add_test_version(self, file, project, version): - eggstorage = self.app.getComponent(IEggStorage) - eggfile = BytesIO(get_data("tests", file)) - eggstorage.put(eggfile, project, version) - - def test_get_spider_list_log_stdout(self): - self.add_test_version("logstdout.egg", "logstdout", "logstdout") - spiders = get_spider_list("logstdout", pythonpath=get_pythonpath_scrapyd()) - # If LOG_STDOUT were respected, the output would be []. - self.assertEqual(sorted(spiders), ["spider1", "spider2"]) - - def test_get_spider_list(self): - # mybot.egg has two spiders, spider1 and spider2 - self.add_test_version("mybot.egg", "mybot", "r1") - spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) - self.assertEqual(sorted(spiders), ["spider1", "spider2"]) - - # mybot2.egg has three spiders, spider1, spider2 and spider3... - # BUT you won't see it here because it's cached. - # Effectivelly it's like if version was never added - self.add_test_version("mybot2.egg", "mybot", "r2") - spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) - self.assertEqual(sorted(spiders), ["spider1", "spider2"]) - - # Let's invalidate the cache for this project... - UtilsCache.invalid_cache("mybot") - - # Now you get the updated list - spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) - self.assertEqual(sorted(spiders), ["spider1", "spider2", "spider3"]) - - # Let's re-deploy mybot.egg and clear cache. It now sees 2 spiders - self.add_test_version("mybot.egg", "mybot", "r3") - UtilsCache.invalid_cache("mybot") - spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) - self.assertEqual(sorted(spiders), ["spider1", "spider2"]) - - # And re-deploying the one with three (mybot2.egg) with a version that - # isn't the higher, won't change what get_spider_list() returns. - self.add_test_version("mybot2.egg", "mybot", "r1a") - UtilsCache.invalid_cache("mybot") - spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) - self.assertEqual(sorted(spiders), ["spider1", "spider2"]) - - @pytest.mark.skipif(os.name == "nt", reason="get_spider_list() unicode fails on windows") - def test_get_spider_list_unicode(self): - # mybotunicode.egg has two spiders, araña1 and araña2 - self.add_test_version("mybotunicode.egg", "mybotunicode", "r1") - spiders = get_spider_list("mybotunicode", pythonpath=get_pythonpath_scrapyd()) - - self.assertEqual(sorted(spiders), ["araña1", "araña2"]) - - def test_failed_spider_list(self): - self.add_test_version("mybot3.egg", "mybot3", "r1") - pypath = get_pythonpath_scrapyd() - # Workaround missing support for context manager in twisted < 15 - - # Add -W ignore to sub-python to prevent warnings & tb mixup in stderr - def popen_wrapper(*args, **kwargs): - cmd, args = args[0], args[1:] - cmd = [cmd[0], "-W", "ignore"] + cmd[1:] - return Popen(cmd, *args, **kwargs) - - with mock.patch("scrapyd.utils.Popen", wraps=popen_wrapper): - exc = self.assertRaises(RunnerError, get_spider_list, "mybot3", pythonpath=pypath) - self.assertRegex(str(exc).rstrip(), r"Exception: This should break the `scrapy list` command$") - - -@pytest.mark.parametrize( - ("versions", "expected"), - [ - (["zzz", "b", "ddd", "a", "x"], ["a", "b", "ddd", "x", "zzz"]), - (["10", "1", "9"], ["1", "9", "10"]), - (["2.11", "2.01", "2.9"], ["2.01", "2.9", "2.11"]), - ], -) -def test_sorted_versions(versions, expected): - assert sorted_versions(versions) == expected From 6a21c9ae9d8548309a41cd2ea4f75186fc57b37d Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 02:18:00 -0400 Subject: [PATCH 2/6] chore(refactor): Move activate_egg function into caller --- scrapyd/eggutils.py | 22 ---------------------- scrapyd/runner.py | 20 +++++++++++++++++++- tests/test_dont_load_settings.py | 2 +- 3 files changed, 20 insertions(+), 24 deletions(-) delete mode 100644 scrapyd/eggutils.py diff --git a/scrapyd/eggutils.py b/scrapyd/eggutils.py deleted file mode 100644 index a1785d35..00000000 --- a/scrapyd/eggutils.py +++ /dev/null @@ -1,22 +0,0 @@ -import os - -import pkg_resources - -from scrapyd.exceptions import BadEggError - - -def activate_egg(eggpath): - """Activate a Scrapy egg file. This is meant to be used from egg runners - to activate a Scrapy egg file. Don't use it from other code as it may - leave unwanted side effects. - """ - distributions = pkg_resources.find_distributions(eggpath) - if isinstance(distributions, tuple): - raise BadEggError - try: - d = next(distributions) - except StopIteration: - raise BadEggError from None - d.activate() - settings_module = d.get_entry_info("scrapy", "settings").module_name - os.environ.setdefault("SCRAPY_SETTINGS_MODULE", settings_module) diff --git a/scrapyd/runner.py b/scrapyd/runner.py index 58e5661b..0ffe3f08 100644 --- a/scrapyd/runner.py +++ b/scrapyd/runner.py @@ -3,10 +3,28 @@ import tempfile from contextlib import contextmanager +import pkg_resources from scrapy.utils.misc import load_object from scrapyd import Config -from scrapyd.eggutils import activate_egg +from scrapyd.exceptions import BadEggError + + +def activate_egg(eggpath): + """Activate a Scrapy egg file. This is meant to be used from egg runners + to activate a Scrapy egg file. Don't use it from other code as it may + leave unwanted side effects. + """ + distributions = pkg_resources.find_distributions(eggpath) + if isinstance(distributions, tuple): + raise BadEggError + try: + d = next(distributions) + except StopIteration: + raise BadEggError from None + d.activate() + settings_module = d.get_entry_info("scrapy", "settings").module_name + os.environ.setdefault("SCRAPY_SETTINGS_MODULE", settings_module) @contextmanager diff --git a/tests/test_dont_load_settings.py b/tests/test_dont_load_settings.py index 8ea3c2db..27dba0ba 100644 --- a/tests/test_dont_load_settings.py +++ b/tests/test_dont_load_settings.py @@ -8,7 +8,7 @@ class SettingsSafeModulesTest(unittest.TestCase): "scrapy.utils.project", "scrapy.utils.conf", "scrapyd.interfaces", - "scrapyd.eggutils", + "scrapyd.runner", ) def test_modules_that_shouldnt_load_settings(self): From 2179bea91cd427328467a0420a77993b80fbd757 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 02:23:08 -0400 Subject: [PATCH 3/6] chore(refactor): Move JsonResource function into caller --- scrapyd/utils.py | 19 ------------------- scrapyd/webservice.py | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/scrapyd/utils.py b/scrapyd/utils.py index aaa6219f..385769c0 100644 --- a/scrapyd/utils.py +++ b/scrapyd/utils.py @@ -1,4 +1,3 @@ -import json import os import sys from subprocess import PIPE, Popen @@ -6,30 +5,12 @@ from urllib.parse import urlsplit from scrapy.utils.misc import load_object -from twisted.web import resource from scrapyd.config import Config from scrapyd.exceptions import RunnerError from scrapyd.sqlite import JsonSqliteDict -class JsonResource(resource.Resource): - json_encoder = json.JSONEncoder() - - def render(self, txrequest): - r = resource.Resource.render(self, txrequest) - return self.encode_object(r, txrequest) - - def encode_object(self, obj, txrequest): - r = "" if obj is None else self.json_encoder.encode(obj) + "\n" - txrequest.setHeader("Content-Type", "application/json") - txrequest.setHeader("Access-Control-Allow-Origin", "*") - txrequest.setHeader("Access-Control-Allow-Methods", "GET, POST, PATCH, PUT, DELETE") - txrequest.setHeader("Access-Control-Allow-Headers", " X-Requested-With") - txrequest.setHeader("Content-Length", str(len(r))) - return r - - class UtilsCache: # array of project name that need to be invalided invalid_cached_projects: ClassVar = [] diff --git a/scrapyd/webservice.py b/scrapyd/webservice.py index 8d695fa9..1ae850e1 100644 --- a/scrapyd/webservice.py +++ b/scrapyd/webservice.py @@ -1,6 +1,7 @@ from __future__ import annotations import functools +import json import sys import traceback import uuid @@ -9,11 +10,11 @@ from io import BytesIO from twisted.python import log -from twisted.web import error, http +from twisted.web import error, http, resource from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError from scrapyd.jobstorage import job_items_url, job_log_url -from scrapyd.utils import JsonResource, UtilsCache, get_spider_list, native_stringify_dict +from scrapyd.utils import UtilsCache, get_spider_list, native_stringify_dict def param( @@ -52,6 +53,23 @@ def wrapper(self, txrequest, *args, **kwargs): return decorator +class JsonResource(resource.Resource): + json_encoder = json.JSONEncoder() + + def render(self, txrequest): + r = resource.Resource.render(self, txrequest) + return self.encode_object(r, txrequest) + + def encode_object(self, obj, txrequest): + r = "" if obj is None else self.json_encoder.encode(obj) + "\n" + txrequest.setHeader("Content-Type", "application/json") + txrequest.setHeader("Access-Control-Allow-Origin", "*") + txrequest.setHeader("Access-Control-Allow-Methods", "GET, POST, PATCH, PUT, DELETE") + txrequest.setHeader("Access-Control-Allow-Headers", " X-Requested-With") + txrequest.setHeader("Content-Length", str(len(r))) + return r + + class WsResource(JsonResource): def __init__(self, root): JsonResource.__init__(self) From a059e8392d4b7ebae88f6d96a862d0931f16dab2 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 02:27:19 -0400 Subject: [PATCH 4/6] chore(refactor): Move get_crawl_args function to caller --- scrapyd/launcher.py | 19 ++++++++++++++++++- scrapyd/utils.py | 25 ++++--------------------- tests/test_launcher.py | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 tests/test_launcher.py diff --git a/scrapyd/launcher.py b/scrapyd/launcher.py index a871250a..204d2454 100644 --- a/scrapyd/launcher.py +++ b/scrapyd/launcher.py @@ -8,7 +8,24 @@ from scrapyd import __version__ from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller -from scrapyd.utils import get_crawl_args, native_stringify_dict +from scrapyd.utils import native_stringify_dict, to_native_str + + +def get_crawl_args(message): + """Return the command-line arguments to use for the scrapy crawl process + that will be started for this message + """ + msg = message.copy() + args = [to_native_str(msg["_spider"])] + del msg["_project"], msg["_spider"] + settings = msg.pop("settings", {}) + for k, v in native_stringify_dict(msg, keys_only=False).items(): + args += ["-a"] + args += [f"{k}={v}"] + for k, v in native_stringify_dict(settings, keys_only=False).items(): + args += ["-s"] + args += [f"{k}={v}"] + return args class Launcher(Service): diff --git a/scrapyd/utils.py b/scrapyd/utils.py index 385769c0..59cafed3 100644 --- a/scrapyd/utils.py +++ b/scrapyd/utils.py @@ -76,36 +76,19 @@ def native_stringify_dict(dct_or_tuples, encoding="utf-8", *, keys_only=True): """ d = {} for k, v in dct_or_tuples.items(): - key = _to_native_str(k, encoding) + key = to_native_str(k, encoding) if keys_only: value = v elif isinstance(v, dict): value = native_stringify_dict(v, encoding=encoding, keys_only=keys_only) elif isinstance(v, list): - value = [_to_native_str(e, encoding) for e in v] + value = [to_native_str(e, encoding) for e in v] else: - value = _to_native_str(v, encoding) + value = to_native_str(v, encoding) d[key] = value return d -def get_crawl_args(message): - """Return the command-line arguments to use for the scrapy crawl process - that will be started for this message - """ - msg = message.copy() - args = [_to_native_str(msg["_spider"])] - del msg["_project"], msg["_spider"] - settings = msg.pop("settings", {}) - for k, v in native_stringify_dict(msg, keys_only=False).items(): - args += ["-a"] - args += [f"{k}={v}"] - for k, v in native_stringify_dict(settings, keys_only=False).items(): - args += ["-s"] - args += [f"{k}={v}"] - return args - - def get_spider_list(project, runner=None, pythonpath=None, version=None): """Return the spider list from the given project, using the given runner""" @@ -150,7 +133,7 @@ def get_spider_list(project, runner=None, pythonpath=None, version=None): return spiders -def _to_native_str(text, encoding="utf-8", errors="strict"): +def to_native_str(text, encoding="utf-8", errors="strict"): if isinstance(text, str): return text return text.decode(encoding, errors) diff --git a/tests/test_launcher.py b/tests/test_launcher.py new file mode 100644 index 00000000..c5f11f0c --- /dev/null +++ b/tests/test_launcher.py @@ -0,0 +1,21 @@ +from scrapyd.launcher import get_crawl_args + + +def test_get_crawl_args(): + msg = {"_project": "lolo", "_spider": "lala"} + + assert get_crawl_args(msg) == ["lala"] + + msg = {"_project": "lolo", "_spider": "lala", "arg1": "val1"} + cargs = get_crawl_args(msg) + + assert cargs == ["lala", "-a", "arg1=val1"] + assert all(isinstance(x, str) for x in cargs), cargs + + +def test_get_crawl_args_with_settings(): + msg = {"_project": "lolo", "_spider": "lala", "arg1": "val1", "settings": {"ONE": "two"}} + cargs = get_crawl_args(msg) + + assert cargs == ["lala", "-a", "arg1=val1", "-s", "ONE=two"] + assert all(isinstance(x, str) for x in cargs), cargs From 73872149220e6b766d80c4ac3a3213b2eb57c3f1 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 02:38:56 -0400 Subject: [PATCH 5/6] chore(refactor): Add tests back --- tests/test_webservice.py | 702 ++++++++++++++++++++++----------------- 1 file changed, 402 insertions(+), 300 deletions(-) diff --git a/tests/test_webservice.py b/tests/test_webservice.py index 94f8f482..bd17b1d6 100644 --- a/tests/test_webservice.py +++ b/tests/test_webservice.py @@ -1,12 +1,19 @@ +import os +import re +from io import BytesIO from pathlib import Path +from pkgutil import get_data from unittest import mock import pytest +from scrapy.utils.test import get_pythonpath from twisted.web import error -from scrapyd.exceptions import DirectoryTraversalError +from scrapyd import get_application +from scrapyd.exceptions import DirectoryTraversalError, RunnerError from scrapyd.interfaces import IEggStorage from scrapyd.jobstorage import Job +from scrapyd.utils import UtilsCache, get_spider_list def fake_list_jobs(*args, **kwargs): @@ -21,307 +28,402 @@ def fake_list_spiders_other(*args, **kwargs): return ["quotesbot", "toscrape-css"] -class TestWebservice: - def add_test_version(self, root, basename, version): - egg_path = Path(__file__).absolute().parent / f"{basename}.egg" +def get_pythonpath_scrapyd(): + scrapyd_path = __import__("scrapyd").__path__[0] + return os.path.join(os.path.dirname(scrapyd_path), get_pythonpath(), os.environ.get("PYTHONPATH", "")) + + +@pytest.fixture() +def app(): + return get_application() + + +def add_test_version(app, file, project, version): + eggstorage = app.getComponent(IEggStorage) + eggfile = BytesIO(get_data("tests", file)) + eggstorage.put(eggfile, project, version) + + +def add_test_version_from_root(root, basename, version): + egg_path = Path(__file__).absolute().parent / f"{basename}.egg" + with open(egg_path, "rb") as f: + root.eggstorage.put(f, "myproject", version) + + +def test_get_spider_list_log_stdout(app): + add_test_version(app, "logstdout.egg", "logstdout", "logstdout") + spiders = get_spider_list("logstdout", pythonpath=get_pythonpath_scrapyd()) + + # If LOG_STDOUT were respected, the output would be []. + assert sorted(spiders) == ["spider1", "spider2"] + + +def test_get_spider_list(app): + # mybot.egg has two spiders, spider1 and spider2 + add_test_version(app, "mybot.egg", "mybot", "r1") + spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) + assert sorted(spiders) == ["spider1", "spider2"] + + # mybot2.egg has three spiders, spider1, spider2 and spider3... + # BUT you won't see it here because it's cached. + # Effectivelly it's like if version was never added + add_test_version(app, "mybot2.egg", "mybot", "r2") + spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) + assert sorted(spiders) == ["spider1", "spider2"] + + # Let's invalidate the cache for this project... + UtilsCache.invalid_cache("mybot") + + # Now you get the updated list + spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) + assert sorted(spiders) == ["spider1", "spider2", "spider3"] + + # Let's re-deploy mybot.egg and clear cache. It now sees 2 spiders + add_test_version(app, "mybot.egg", "mybot", "r3") + UtilsCache.invalid_cache("mybot") + spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) + assert sorted(spiders) == ["spider1", "spider2"] + + # And re-deploying the one with three (mybot2.egg) with a version that + # isn't the higher, won't change what get_spider_list() returns. + add_test_version(app, "mybot2.egg", "mybot", "r1a") + UtilsCache.invalid_cache("mybot") + spiders = get_spider_list("mybot", pythonpath=get_pythonpath_scrapyd()) + assert sorted(spiders) == ["spider1", "spider2"] + + +@pytest.mark.skipif(os.name == "nt", reason="get_spider_list() unicode fails on windows") +def test_get_spider_list_unicode(app): + # mybotunicode.egg has two spiders, araña1 and araña2 + add_test_version(app, "mybotunicode.egg", "mybotunicode", "r1") + spiders = get_spider_list("mybotunicode", pythonpath=get_pythonpath_scrapyd()) + + assert sorted(spiders) == ["araña1", "araña2"] + + +def test_failed_spider_list(app): + add_test_version(app, "mybot3.egg", "mybot3", "r1") + with pytest.raises(RunnerError) as exc: + get_spider_list("mybot3", pythonpath=get_pythonpath_scrapyd()) + + assert re.search(r"Exception: This should break the `scrapy list` command$", str(exc.value)) + + +def test_list_spiders(txrequest, site_no_egg): + add_test_version_from_root(site_no_egg, "mybot", "r1") + add_test_version_from_root(site_no_egg, "mybot2", "r2") + + txrequest.args = {b"project": [b"myproject"]} + endpoint = b"listspiders.json" + content = site_no_egg.children[endpoint].render_GET(txrequest) + + assert content["spiders"] == ["spider1", "spider2", "spider3"] + assert content["status"] == "ok" + + +def test_list_spiders_nonexistent(txrequest, site_no_egg): + txrequest.args = { + b"project": [b"nonexistent"], + } + endpoint = b"listspiders.json" + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_GET(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + + +def test_list_spiders_version(txrequest, site_no_egg): + add_test_version_from_root(site_no_egg, "mybot", "r1") + add_test_version_from_root(site_no_egg, "mybot2", "r2") + + txrequest.args = { + b"project": [b"myproject"], + b"_version": [b"r1"], + } + endpoint = b"listspiders.json" + content = site_no_egg.children[endpoint].render_GET(txrequest) + + assert content["spiders"] == ["spider1", "spider2"] + assert content["status"] == "ok" + + +def test_list_spiders_version_nonexistent(txrequest, site_no_egg): + add_test_version_from_root(site_no_egg, "mybot", "r1") + add_test_version_from_root(site_no_egg, "mybot2", "r2") + + txrequest.args = { + b"project": [b"myproject"], + b"_version": [b"nonexistent"], + } + endpoint = b"listspiders.json" + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_GET(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + + +def test_list_versions(txrequest, site_with_egg): + txrequest.args = { + b"project": [b"quotesbot"], + } + endpoint = b"listversions.json" + content = site_with_egg.children[endpoint].render_GET(txrequest) + + assert content["versions"] == ["0_1"] + assert content["status"] == "ok" + + +def test_list_versions_nonexistent(txrequest, site_no_egg): + txrequest.args = { + b"project": [b"quotesbot"], + } + endpoint = b"listversions.json" + content = site_no_egg.children[endpoint].render_GET(txrequest) + + assert content["versions"] == [] + assert content["status"] == "ok" + + +def test_list_projects(txrequest, site_with_egg): + txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"toscrape-css"]} + endpoint = b"listprojects.json" + content = site_with_egg.children[endpoint].render_GET(txrequest) + + assert content["projects"] == ["quotesbot"] + + +def test_list_jobs(txrequest, site_with_egg): + txrequest.args = {} + endpoint = b"listjobs.json" + content = site_with_egg.children[endpoint].render_GET(txrequest) + + assert set(content) == {"node_name", "status", "pending", "running", "finished"} + + +@mock.patch("scrapyd.jobstorage.MemoryJobStorage.__iter__", new=fake_list_jobs) +def test_list_jobs_finished(txrequest, site_with_egg): + txrequest.args = {} + endpoint = b"listjobs.json" + content = site_with_egg.children[endpoint].render_GET(txrequest) + + assert set(content["finished"][0]) == { + "project", + "spider", + "id", + "start_time", + "end_time", + "log_url", + "items_url", + } + + +def test_delete_version(txrequest, site_with_egg): + endpoint = b"delversion.json" + txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"]} + + storage = site_with_egg.app.getComponent(IEggStorage) + version, egg = storage.get("quotesbot") + if egg: + egg.close() + + content = site_with_egg.children[endpoint].render_POST(txrequest) + no_version, no_egg = storage.get("quotesbot") + if no_egg: + no_egg.close() + + assert version is not None + assert content["status"] == "ok" + assert "node_name" in content + assert no_version is None + + +def test_delete_version_nonexistent_project(txrequest, site_with_egg): + endpoint = b"delversion.json" + txrequest.args = {b"project": [b"quotesbot"], b"version": [b"nonexistent"]} + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + + +def test_delete_version_nonexistent_version(txrequest, site_no_egg): + endpoint = b"delversion.json" + txrequest.args = {b"project": [b"nonexistent"], b"version": [b"0.1"]} + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version '0.1' not found" + + +def test_delete_project(txrequest, site_with_egg): + endpoint = b"delproject.json" + txrequest.args = { + b"project": [b"quotesbot"], + } + + storage = site_with_egg.app.getComponent(IEggStorage) + version, egg = storage.get("quotesbot") + if egg: + egg.close() + + content = site_with_egg.children[endpoint].render_POST(txrequest) + no_version, no_egg = storage.get("quotesbot") + if no_egg: + no_egg.close() + + assert version is not None + assert content["status"] == "ok" + assert "node_name" in content + assert no_version is None + + +def test_delete_project_nonexistent(txrequest, site_no_egg): + endpoint = b"delproject.json" + txrequest.args = { + b"project": [b"nonexistent"], + } + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + + +def test_addversion(txrequest, site_no_egg): + endpoint = b"addversion.json" + txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"]} + egg_path = Path(__file__).absolute().parent / "quotesbot.egg" + with open(egg_path, "rb") as f: + txrequest.args[b"egg"] = [f.read()] + + storage = site_no_egg.app.getComponent(IEggStorage) + version, egg = storage.get("quotesbot") + if egg: + egg.close() + + content = site_no_egg.children[endpoint].render_POST(txrequest) + no_version, no_egg = storage.get("quotesbot") + if no_egg: + no_egg.close() + + assert version is None + assert content["status"] == "ok" + assert "node_name" in content + assert no_version == "0_1" + + +def test_schedule(txrequest, site_with_egg): + endpoint = b"schedule.json" + txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"toscrape-css"]} + + content = site_with_egg.children[endpoint].render_POST(txrequest) + + assert site_with_egg.scheduler.calls == [["quotesbot", "toscrape-css"]] + assert content["status"] == "ok" + assert "jobid" in content + + +def test_schedule_nonexistent_project(txrequest, site_no_egg): + endpoint = b"schedule.json" + txrequest.args = {b"project": [b"nonexistent"], b"spider": [b"toscrape-css"]} + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + + +def test_schedule_nonexistent_version(txrequest, site_with_egg): + endpoint = b"schedule.json" + txrequest.args = {b"project": [b"quotesbot"], b"_version": [b"nonexistent"], b"spider": [b"toscrape-css"]} + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + + +def test_schedule_nonexistent_spider(txrequest, site_with_egg): + endpoint = b"schedule.json" + txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"nonexistent"]} + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"spider 'nonexistent' not found" + + +@pytest.mark.parametrize( + ("endpoint", "attach_egg", "method"), + [ + (b"addversion.json", True, "render_POST"), + (b"listversions.json", False, "render_GET"), + (b"delproject.json", False, "render_POST"), + (b"delversion.json", False, "render_POST"), + ], +) +def test_project_directory_traversal(txrequest, site_no_egg, endpoint, attach_egg, method): + txrequest.args = { + b"project": [b"../p"], + b"version": [b"0.1"], + } + + if attach_egg: + egg_path = Path(__file__).absolute().parent / "quotesbot.egg" with open(egg_path, "rb") as f: - root.eggstorage.put(f, "myproject", version) - - def test_list_spiders(self, txrequest, site_no_egg): - self.add_test_version(site_no_egg, "mybot", "r1") - self.add_test_version(site_no_egg, "mybot2", "r2") - - txrequest.args = {b"project": [b"myproject"]} - endpoint = b"listspiders.json" - content = site_no_egg.children[endpoint].render_GET(txrequest) - - assert content["spiders"] == ["spider1", "spider2", "spider3"] - assert content["status"] == "ok" - - def test_list_spiders_nonexistent(self, txrequest, site_no_egg): - txrequest.args = { - b"project": [b"nonexistent"], - } - endpoint = b"listspiders.json" - - with pytest.raises(error.Error) as exc: - site_no_egg.children[endpoint].render_GET(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"project 'nonexistent' not found" - - def test_list_spiders_version(self, txrequest, site_no_egg): - self.add_test_version(site_no_egg, "mybot", "r1") - self.add_test_version(site_no_egg, "mybot2", "r2") - - txrequest.args = { - b"project": [b"myproject"], - b"_version": [b"r1"], - } - endpoint = b"listspiders.json" - content = site_no_egg.children[endpoint].render_GET(txrequest) - - assert content["spiders"] == ["spider1", "spider2"] - assert content["status"] == "ok" - - def test_list_spiders_version_nonexistent(self, txrequest, site_no_egg): - self.add_test_version(site_no_egg, "mybot", "r1") - self.add_test_version(site_no_egg, "mybot2", "r2") - - txrequest.args = { - b"project": [b"myproject"], - b"_version": [b"nonexistent"], - } - endpoint = b"listspiders.json" - - with pytest.raises(error.Error) as exc: - site_no_egg.children[endpoint].render_GET(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"version 'nonexistent' not found" - - def test_list_versions(self, txrequest, site_with_egg): - txrequest.args = { - b"project": [b"quotesbot"], - } - endpoint = b"listversions.json" - content = site_with_egg.children[endpoint].render_GET(txrequest) - - assert content["versions"] == ["0_1"] - assert content["status"] == "ok" - - def test_list_versions_nonexistent(self, txrequest, site_no_egg): - txrequest.args = { - b"project": [b"quotesbot"], - } - endpoint = b"listversions.json" - content = site_no_egg.children[endpoint].render_GET(txrequest) - - assert content["versions"] == [] - assert content["status"] == "ok" - - def test_list_projects(self, txrequest, site_with_egg): - txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"toscrape-css"]} - endpoint = b"listprojects.json" - content = site_with_egg.children[endpoint].render_GET(txrequest) - - assert content["projects"] == ["quotesbot"] - - def test_list_jobs(self, txrequest, site_with_egg): - txrequest.args = {} - endpoint = b"listjobs.json" - content = site_with_egg.children[endpoint].render_GET(txrequest) - - assert set(content) == {"node_name", "status", "pending", "running", "finished"} - - @mock.patch("scrapyd.jobstorage.MemoryJobStorage.__iter__", new=fake_list_jobs) - def test_list_jobs_finished(self, txrequest, site_with_egg): - txrequest.args = {} - endpoint = b"listjobs.json" - content = site_with_egg.children[endpoint].render_GET(txrequest) - - assert set(content["finished"][0]) == { - "project", - "spider", - "id", - "start_time", - "end_time", - "log_url", - "items_url", - } - - def test_delete_version(self, txrequest, site_with_egg): - endpoint = b"delversion.json" - txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"]} - - storage = site_with_egg.app.getComponent(IEggStorage) - version, egg = storage.get("quotesbot") - if egg: - egg.close() - - content = site_with_egg.children[endpoint].render_POST(txrequest) - no_version, no_egg = storage.get("quotesbot") - if no_egg: - no_egg.close() - - assert version is not None - assert content["status"] == "ok" - assert "node_name" in content - assert no_version is None - - def test_delete_version_nonexistent_project(self, txrequest, site_with_egg): - endpoint = b"delversion.json" - txrequest.args = {b"project": [b"quotesbot"], b"version": [b"nonexistent"]} - - with pytest.raises(error.Error) as exc: - site_with_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"version 'nonexistent' not found" - - def test_delete_version_nonexistent_version(self, txrequest, site_no_egg): - endpoint = b"delversion.json" - txrequest.args = {b"project": [b"nonexistent"], b"version": [b"0.1"]} - - with pytest.raises(error.Error) as exc: - site_no_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"version '0.1' not found" - - def test_delete_project(self, txrequest, site_with_egg): - endpoint = b"delproject.json" - txrequest.args = { - b"project": [b"quotesbot"], - } - - storage = site_with_egg.app.getComponent(IEggStorage) - version, egg = storage.get("quotesbot") - if egg: - egg.close() - - content = site_with_egg.children[endpoint].render_POST(txrequest) - no_version, no_egg = storage.get("quotesbot") - if no_egg: - no_egg.close() - - assert version is not None - assert content["status"] == "ok" - assert "node_name" in content - assert no_version is None - - def test_delete_project_nonexistent(self, txrequest, site_no_egg): - endpoint = b"delproject.json" - txrequest.args = { - b"project": [b"nonexistent"], - } - - with pytest.raises(error.Error) as exc: - site_no_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"project 'nonexistent' not found" - - def test_addversion(self, txrequest, site_no_egg): - endpoint = b"addversion.json" - txrequest.args = {b"project": [b"quotesbot"], b"version": [b"0.1"]} + txrequest.args[b"egg"] = [f.read()] + + with pytest.raises(DirectoryTraversalError) as exc: + getattr(site_no_egg.children[endpoint], method)(txrequest) + + assert str(exc.value) == "../p" + + storage = site_no_egg.app.getComponent(IEggStorage) + version, egg = storage.get("quotesbot") + if egg: + egg.close() + + assert version is None + + +@pytest.mark.parametrize( + ("endpoint", "attach_egg", "method"), + [ + (b"schedule.json", False, "render_POST"), + (b"listspiders.json", False, "render_GET"), + ], +) +def test_project_directory_traversal_runner(txrequest, site_no_egg, endpoint, attach_egg, method): + txrequest.args = { + b"project": [b"../p"], + b"spider": [b"s"], + } + + if attach_egg: egg_path = Path(__file__).absolute().parent / "quotesbot.egg" with open(egg_path, "rb") as f: txrequest.args[b"egg"] = [f.read()] - storage = site_no_egg.app.getComponent(IEggStorage) - version, egg = storage.get("quotesbot") - if egg: - egg.close() - - content = site_no_egg.children[endpoint].render_POST(txrequest) - no_version, no_egg = storage.get("quotesbot") - if no_egg: - no_egg.close() - - assert version is None - assert content["status"] == "ok" - assert "node_name" in content - assert no_version == "0_1" - - def test_schedule(self, txrequest, site_with_egg): - endpoint = b"schedule.json" - txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"toscrape-css"]} - - content = site_with_egg.children[endpoint].render_POST(txrequest) - - assert site_with_egg.scheduler.calls == [["quotesbot", "toscrape-css"]] - assert content["status"] == "ok" - assert "jobid" in content - - def test_schedule_nonexistent_project(self, txrequest, site_no_egg): - endpoint = b"schedule.json" - txrequest.args = {b"project": [b"nonexistent"], b"spider": [b"toscrape-css"]} - - with pytest.raises(error.Error) as exc: - site_no_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"project 'nonexistent' not found" - - def test_schedule_nonexistent_version(self, txrequest, site_with_egg): - endpoint = b"schedule.json" - txrequest.args = {b"project": [b"quotesbot"], b"_version": [b"nonexistent"], b"spider": [b"toscrape-css"]} - - with pytest.raises(error.Error) as exc: - site_with_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"version 'nonexistent' not found" - - def test_schedule_nonexistent_spider(self, txrequest, site_with_egg): - endpoint = b"schedule.json" - txrequest.args = {b"project": [b"quotesbot"], b"spider": [b"nonexistent"]} - - with pytest.raises(error.Error) as exc: - site_with_egg.children[endpoint].render_POST(txrequest) - - assert exc.value.status == b"200" - assert exc.value.message == b"spider 'nonexistent' not found" - - @pytest.mark.parametrize( - ("endpoint", "attach_egg", "method"), - [ - (b"addversion.json", True, "render_POST"), - (b"listversions.json", False, "render_GET"), - (b"delproject.json", False, "render_POST"), - (b"delversion.json", False, "render_POST"), - ], - ) - def test_project_directory_traversal(self, txrequest, site_no_egg, endpoint, attach_egg, method): - txrequest.args = { - b"project": [b"../p"], - b"version": [b"0.1"], - } - - if attach_egg: - egg_path = Path(__file__).absolute().parent / "quotesbot.egg" - with open(egg_path, "rb") as f: - txrequest.args[b"egg"] = [f.read()] - - with pytest.raises(DirectoryTraversalError) as exc: - getattr(site_no_egg.children[endpoint], method)(txrequest) - - assert str(exc.value) == "../p" - - storage = site_no_egg.app.getComponent(IEggStorage) - version, egg = storage.get("quotesbot") - if egg: - egg.close() - - assert version is None - - @pytest.mark.parametrize( - ("endpoint", "attach_egg", "method"), - [ - (b"schedule.json", False, "render_POST"), - (b"listspiders.json", False, "render_GET"), - ], - ) - def test_project_directory_traversal_runner(self, txrequest, site_no_egg, endpoint, attach_egg, method): - txrequest.args = { - b"project": [b"../p"], - b"spider": [b"s"], - } - - if attach_egg: - egg_path = Path(__file__).absolute().parent / "quotesbot.egg" - with open(egg_path, "rb") as f: - txrequest.args[b"egg"] = [f.read()] - - with pytest.raises(DirectoryTraversalError) as exc: - getattr(site_no_egg.children[endpoint], method)(txrequest) - - assert str(exc.value) == "../p" - - storage = site_no_egg.app.getComponent(IEggStorage) - version, egg = storage.get("quotesbot") - if egg: - egg.close() - - assert version is None + with pytest.raises(DirectoryTraversalError) as exc: + getattr(site_no_egg.children[endpoint], method)(txrequest) + + assert str(exc.value) == "../p" + + storage = site_no_egg.app.getComponent(IEggStorage) + version, egg = storage.get("quotesbot") + if egg: + egg.close() + + assert version is None From 8407b0f1d35d8fbeb89587964bba8015ca90de12 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 19 Jul 2024 03:01:31 -0400 Subject: [PATCH 6/6] chore(refactor): Move get_spider_list function into caller --- scrapyd/utils.py | 77 --------------------------------------- scrapyd/webservice.py | 79 +++++++++++++++++++++++++++++++++++++++- tests/test_webservice.py | 4 +- 3 files changed, 79 insertions(+), 81 deletions(-) diff --git a/scrapyd/utils.py b/scrapyd/utils.py index 59cafed3..e3486ba4 100644 --- a/scrapyd/utils.py +++ b/scrapyd/utils.py @@ -1,41 +1,8 @@ import os -import sys -from subprocess import PIPE, Popen -from typing import ClassVar from urllib.parse import urlsplit from scrapy.utils.misc import load_object -from scrapyd.config import Config -from scrapyd.exceptions import RunnerError -from scrapyd.sqlite import JsonSqliteDict - - -class UtilsCache: - # array of project name that need to be invalided - invalid_cached_projects: ClassVar = [] - - def __init__(self): - self.cache_manager = JsonSqliteDict(table="utils_cache_manager") - - # Invalid the spider's list's cache of a given project (by name) - @staticmethod - def invalid_cache(project): - UtilsCache.invalid_cached_projects.append(project) - - def __getitem__(self, key): - for p in UtilsCache.invalid_cached_projects: - if p in self.cache_manager: - del self.cache_manager[p] - UtilsCache.invalid_cached_projects[:] = [] - return self.cache_manager[key] - - def __setitem__(self, key, value): - self.cache_manager[key] = value - - def __repr__(self): - return f"UtilsCache(cache_manager={self.cache_manager!r})" - def get_spider_queues(config): """Return a dict of Spider Queues keyed by project name""" @@ -89,50 +56,6 @@ def native_stringify_dict(dct_or_tuples, encoding="utf-8", *, keys_only=True): return d -def get_spider_list(project, runner=None, pythonpath=None, version=None): - """Return the spider list from the given project, using the given runner""" - - # UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys, - # so the stored dict will have a "null" key, instead of a None key. - if version is None: - version = "" - - if "cache" not in get_spider_list.__dict__: - get_spider_list.cache = UtilsCache() - try: - return get_spider_list.cache[project][version] - except KeyError: - pass - - if runner is None: - runner = Config().get("runner") - - env = os.environ.copy() - env["PYTHONIOENCODING"] = "UTF-8" - env["SCRAPY_PROJECT"] = project - if pythonpath: - env["PYTHONPATH"] = pythonpath - if version: - env["SCRAPYD_EGG_VERSION"] = version - pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"] - proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env) - out, err = proc.communicate() - if proc.returncode: - msg = err or out or "" - msg = msg.decode("utf8") - raise RunnerError(msg) - - spiders = out.decode("utf-8").splitlines() - try: - project_cache = get_spider_list.cache[project] - project_cache[version] = spiders - except KeyError: - project_cache = {version: spiders} - get_spider_list.cache[project] = project_cache - - return spiders - - def to_native_str(text, encoding="utf-8", errors="strict"): if isinstance(text, str): return text diff --git a/scrapyd/webservice.py b/scrapyd/webservice.py index 1ae850e1..1a5eb505 100644 --- a/scrapyd/webservice.py +++ b/scrapyd/webservice.py @@ -2,19 +2,24 @@ import functools import json +import os import sys import traceback import uuid import zipfile from copy import copy from io import BytesIO +from subprocess import PIPE, Popen +from typing import ClassVar from twisted.python import log from twisted.web import error, http, resource -from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError +from scrapyd.config import Config +from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError, RunnerError from scrapyd.jobstorage import job_items_url, job_log_url -from scrapyd.utils import UtilsCache, get_spider_list, native_stringify_dict +from scrapyd.sqlite import JsonSqliteDict +from scrapyd.utils import native_stringify_dict def param( @@ -53,6 +58,76 @@ def wrapper(self, txrequest, *args, **kwargs): return decorator +def get_spider_list(project, runner=None, pythonpath=None, version=None): + """Return the spider list from the given project, using the given runner""" + + # UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys, + # so the stored dict will have a "null" key, instead of a None key. + if version is None: + version = "" + + if "cache" not in get_spider_list.__dict__: + get_spider_list.cache = UtilsCache() + try: + return get_spider_list.cache[project][version] + except KeyError: + pass + + if runner is None: + runner = Config().get("runner") + + env = os.environ.copy() + env["PYTHONIOENCODING"] = "UTF-8" + env["SCRAPY_PROJECT"] = project + if pythonpath: + env["PYTHONPATH"] = pythonpath + if version: + env["SCRAPYD_EGG_VERSION"] = version + pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"] + proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env) + out, err = proc.communicate() + if proc.returncode: + msg = err or out or "" + msg = msg.decode("utf8") + raise RunnerError(msg) + + spiders = out.decode("utf-8").splitlines() + try: + project_cache = get_spider_list.cache[project] + project_cache[version] = spiders + except KeyError: + project_cache = {version: spiders} + get_spider_list.cache[project] = project_cache + + return spiders + + +class UtilsCache: + # array of project name that need to be invalided + invalid_cached_projects: ClassVar = [] + + def __init__(self): + self.cache_manager = JsonSqliteDict(table="utils_cache_manager") + + # Invalid the spider's list's cache of a given project (by name) + @staticmethod + def invalid_cache(project): + UtilsCache.invalid_cached_projects.append(project) + + def __getitem__(self, key): + for p in UtilsCache.invalid_cached_projects: + if p in self.cache_manager: + del self.cache_manager[p] + UtilsCache.invalid_cached_projects[:] = [] + return self.cache_manager[key] + + def __setitem__(self, key, value): + self.cache_manager[key] = value + + def __repr__(self): + return f"UtilsCache(cache_manager={self.cache_manager!r})" + + class JsonResource(resource.Resource): json_encoder = json.JSONEncoder() diff --git a/tests/test_webservice.py b/tests/test_webservice.py index bd17b1d6..7255610d 100644 --- a/tests/test_webservice.py +++ b/tests/test_webservice.py @@ -13,7 +13,7 @@ from scrapyd.exceptions import DirectoryTraversalError, RunnerError from scrapyd.interfaces import IEggStorage from scrapyd.jobstorage import Job -from scrapyd.utils import UtilsCache, get_spider_list +from scrapyd.webservice import UtilsCache, get_spider_list def fake_list_jobs(*args, **kwargs): @@ -106,7 +106,7 @@ def test_failed_spider_list(app): with pytest.raises(RunnerError) as exc: get_spider_list("mybot3", pythonpath=get_pythonpath_scrapyd()) - assert re.search(r"Exception: This should break the `scrapy list` command$", str(exc.value)) + assert re.search(f"Exception: This should break the `scrapy list` command{os.linesep}$", str(exc.value)) def test_list_spiders(txrequest, site_no_egg):