feat: Use ScrapyProcessProtocol instead of Job (from #359)
jpmckinney committed Jul 25, 2024
1 parent 9451b71 commit e3b51ee
Showing 11 changed files with 74 additions and 82 deletions.
5 changes: 3 additions & 2 deletions docs/news.rst
@@ -42,7 +42,6 @@ Web UI
API
^^^

- The ``Access-Control-Allow-Methods`` response header contains only the HTTP methods to which webservices respond.
- Clarify error messages, for example:

- ``'project' parameter is required``, instead of ``'project'`` (KeyError)
@@ -79,6 +78,7 @@ Library
- ``sorted_versions`` to ``scrapyd.eggstorage``
- ``get_crawl_args`` to ``scrapyd.launcher``

- :ref:`jobstorage` uses the ``ScrapyProcessProtocol`` class, by default. If :ref:`jobstorage` is set to ``scrapyd.jobstorage.SqliteJobStorage``, Scrapyd 1.3.0 uses a ``Job`` class, instead.
- Move the ``activate_egg`` function from the ``scrapyd.eggutils`` module to its caller, the ``scrapyd.runner`` module.
- Move the ``job_items_url`` and ``job_log_url`` functions from the ``scrapyd.jobstorage`` module to the ``scrapyd.utils`` module. :ref:`jobstorage` is not responsible for URLs.
- Change the ``get_crawl_args`` function to no longer convert ``bytes`` to ``str``, as already done by its caller.
@@ -100,7 +100,8 @@ Fixed
API
^^^

- The Content-Length header counts the number of bytes, instead of the number of characters.
- The ``Content-Length`` header counts the number of bytes, instead of the number of characters.
- The ``Access-Control-Allow-Methods`` response header contains only the HTTP methods to which webservices respond.
- The :ref:`schedule.json` webservice sets the ``node_name`` field in error responses.
- The next pending job for all but one project was unreported by the :ref:`daemonstatus.json` and :ref:`listjobs.json` webservices, and was not cancellable by the :ref:`cancel.json` webservice.

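The changelog entry above about :ref:`jobstorage` refers to the ``jobstorage`` setting in scrapyd.conf. As a hedged sketch outside this diff, this is how the configured backend name can be read, assuming the packaged default of ``scrapyd.jobstorage.MemoryJobStorage``; the actual wiring in ``scrapyd.app`` may differ:

from scrapyd.config import Config

config = Config()  # reads the packaged default_scrapyd.conf plus any local scrapyd.conf
backend = config.get("jobstorage", "scrapyd.jobstorage.MemoryJobStorage")
print(backend)  # set to scrapyd.jobstorage.SqliteJobStorage to persist finished jobs across restarts
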
36 changes: 6 additions & 30 deletions scrapyd/jobstorage.py
@@ -3,38 +3,11 @@
Job storage was previously in-memory only and managed by the launcher.
"""

import datetime

from zope.interface import implementer

from scrapyd import sqlite
from scrapyd.interfaces import IJobStorage


class Job:
def __init__(self, project, spider, job=None, start_time=None, end_time=None):
self.project = project
self.spider = spider
self.job = job
self.start_time = start_time if start_time else datetime.datetime.now()
self.end_time = end_time if end_time else datetime.datetime.now()

# For equality assertions in tests.
def __eq__(self, other):
return (
self.project == other.project
and self.spider == other.spider
and self.job == other.job
and self.start_time == other.start_time
and self.end_time == other.end_time
)

# For error messsages in tests.
def __repr__(self):
return (
f"Job(project={self.project}, spider={self.spider}, job={self.job}, "
f"start_time={self.start_time}, end_time={self.end_time})"
)
from scrapyd.launcher import ScrapyProcessProtocol


@implementer(IJobStorage)
@@ -74,5 +47,8 @@ def __len__(self):
return len(self.jobs)

def __iter__(self):
for project, spider, job, start_time, end_time in self.jobs:
yield Job(project=project, spider=spider, job=job, start_time=start_time, end_time=end_time)
for project, spider, jobid, start_time, end_time in self.jobs:
job = ScrapyProcessProtocol(project, spider, jobid, env={}, args=[])
job.start_time = start_time
job.end_time = end_time
yield job
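
A brief, illustrative sketch of the storage contract after this change (not part of this diff): both backends now store and yield ``ScrapyProcessProtocol`` objects instead of ``Job`` instances, using the ``IJobStorage`` methods shown above. The ``Config()`` setup here is an assumption for the example.

import datetime

from scrapyd.config import Config
from scrapyd.jobstorage import MemoryJobStorage
from scrapyd.launcher import ScrapyProcessProtocol

# Build a "finished" job the way the launcher hands it to job storage.
job = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])
job.end_time = datetime.datetime.now()

storage = MemoryJobStorage(Config())
storage.add(job)

for finished in storage:  # same iteration contract as SqliteJobStorage.__iter__ above
    print(finished.project, finished.spider, finished.job, finished.end_time)
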
21 changes: 17 additions & 4 deletions scrapyd/launcher.py
@@ -88,21 +88,34 @@ def _get_max_proc(self, config):
# https://docs.twisted.org/en/stable/api/twisted.internet.protocol.ProcessProtocol.html
class ScrapyProcessProtocol(protocol.ProcessProtocol):
def __init__(self, project, spider, job, env, args):
self.pid = None
self.project = project
self.spider = spider
self.job = job
self.pid = None
self.start_time = datetime.datetime.now()
self.end_time = None
self.env = env
self.args = args
self.env = env
self.deferred = defer.Deferred()

# For equality assertions in tests.
def __eq__(self, other):
return (
self.project == other.project
and self.spider == other.spider
and self.job == other.job
and self.pid == other.pid
and self.start_time == other.start_time
and self.end_time == other.end_time
and self.args == other.args
and self.env == other.env
)

# For error messsages in tests.
def __repr__(self):
return (
f"ScrapyProcessProtocol(pid={self.pid} project={self.project} spider={self.spider} job={self.job} "
f"start_time={self.start_time} end_time={self.end_time} env={self.env} args={self.args})"
f"ScrapyProcessProtocol(project={self.project} spider={self.spider} job={self.job} pid={self.pid} "
f"start_time={self.start_time} end_time={self.end_time} args={self.args} env={self.env})"
)

def outReceived(self, data):
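
To illustrate the value semantics introduced above (an example outside this diff): ``__eq__`` compares every listed field, and ``__init__`` stamps ``start_time`` with ``datetime.now()``, so two freshly constructed protocols compare equal only once their timestamps are aligned.

from scrapyd.launcher import ScrapyProcessProtocol

a = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])
b = ScrapyProcessProtocol("p1", "s1", "j1", env={}, args=[])

b.start_time = a.start_time  # align the timestamp set in __init__
assert a == b  # project, spider, job, pid, times, args and env all match

print(repr(a))  # ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid=None start_time=...
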
2 changes: 2 additions & 0 deletions scrapyd/sqlite.py
@@ -29,6 +29,8 @@ def __init__(self, database, table):
def __len__(self):
return self.conn.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()[0]

# SQLite JSON is enabled by default since 3.38.0 (2022-02-22), and JSONB is available since 3.45.0 (2024-01-15).
# https://sqlite.org/json1.html
def encode(self, obj):
return sqlite3.Binary(json.dumps(obj).encode("ascii"))

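The comment added above notes SQLite's JSON support; ``encode`` stores objects as JSON-encoded BLOBs. A self-contained sketch of that round trip (the matching ``decode`` is assumed to be the inverse, ``json.loads`` of the stored bytes; it is not shown in this diff):

import json
import sqlite3

def encode(obj):
    return sqlite3.Binary(json.dumps(obj).encode("ascii"))

def decode(blob):  # assumed counterpart to encode(), for illustration only
    return json.loads(bytes(blob).decode("ascii"))

message = {"name": "s1", "_job": "j1", "settings": {"DOWNLOAD_DELAY": "2"}}
assert decode(encode(message)) == message
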
14 changes: 14 additions & 0 deletions tests/__init__.py
@@ -1,9 +1,12 @@
import datetime
import io
import os.path
import pkgutil

from twisted.logger import eventAsText

from scrapyd.launcher import ScrapyProcessProtocol


def get_egg_data(basename):
return pkgutil.get_data("tests", f"fixtures/{basename}.egg")
@@ -19,3 +22,14 @@ def root_add_version(root, project, version, basename):

def get_message(captured):
return eventAsText(captured[0]).split(" ", 1)[1]


def get_finished_job(project="p1", spider="s1", job="j1", start_time=None, end_time=None):
if start_time is None:
start_time = datetime.datetime.now()
if end_time is None:
end_time = datetime.datetime.now()
process = ScrapyProcessProtocol(project, spider, job, {}, [])
process.start_time = start_time
process.end_time = end_time
return process
16 changes: 0 additions & 16 deletions tests/test_job.py

This file was deleted.

9 changes: 5 additions & 4 deletions tests/test_jobstorage.py
@@ -4,11 +4,12 @@

from scrapyd.config import Config
from scrapyd.interfaces import IJobStorage
from scrapyd.jobstorage import Job, MemoryJobStorage, SqliteJobStorage
from scrapyd.jobstorage import MemoryJobStorage, SqliteJobStorage
from tests import get_finished_job

job1 = Job("p1", "s1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))
job2 = Job("p2", "s2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))
job3 = Job("p3", "s3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))
job1 = get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7))
job2 = get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8))
job3 = get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9))


def pytest_generate_tests(metafunc):
2 changes: 1 addition & 1 deletion tests/test_launcher.py
@@ -183,4 +183,4 @@ def test_process_ended_terminated(environ, process):


def test_repr(process):
assert repr(process).startswith(f"ScrapyProcessProtocol(pid={process.pid} project=p1 spider=s1 job=j1 start_time=")
assert repr(process).startswith(f"ScrapyProcessProtocol(project=p1 spider=s1 job=j1 pid={process.pid} start_time=")
8 changes: 4 additions & 4 deletions tests/test_sqlite.py
@@ -2,8 +2,8 @@

import pytest

from scrapyd.jobstorage import Job
from scrapyd.sqlite import JsonSqlitePriorityQueue, SqliteFinishedJobs
from tests import get_finished_job


@pytest.fixture()
@@ -14,9 +14,9 @@ def jsonsqlitepriorityqueue():
@pytest.fixture()
def sqlitefinishedjobs():
q = SqliteFinishedJobs(":memory:")
q.add(Job("p1", "s1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)))
q.add(Job("p2", "s2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)))
q.add(Job("p3", "s3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)))
q.add(get_finished_job("p1", "s1", "j1", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 7)))
q.add(get_finished_job("p2", "s2", "j2", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 8)))
q.add(get_finished_job("p3", "s3", "j3", end_time=datetime.datetime(2001, 2, 3, 4, 5, 6, 9)))
return q


27 changes: 14 additions & 13 deletions tests/test_webservice.py
@@ -4,20 +4,19 @@
import os
import re
import sys
from unittest.mock import MagicMock, call
from unittest.mock import MagicMock, PropertyMock, call

import pytest
from twisted.logger import LogLevel, capturedLogs
from twisted.web import error

from scrapyd.exceptions import DirectoryTraversalError, RunnerError
from scrapyd.interfaces import IEggStorage
from scrapyd.jobstorage import Job
from scrapyd.launcher import ScrapyProcessProtocol
from scrapyd.webservice import spider_list
from tests import get_egg_data, get_message, has_settings, root_add_version
from tests import get_egg_data, get_finished_job, get_message, has_settings, root_add_version

job1 = Job(
job1 = get_finished_job(
project="p1",
spider="s1",
job="j1",
@@ -30,7 +29,9 @@
def scrapy_process():
process = ScrapyProcessProtocol(project="p1", spider="s1", job="j1", env={}, args=[])
process.start_time = datetime.datetime(2001, 2, 3, 4, 5, 6, 9)
process.end_time = datetime.datetime(2001, 2, 3, 4, 5, 6, 10)
process.transport = MagicMock()
type(process.transport).pid = PropertyMock(return_value=12345)
return process
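
The fixture above stubs ``transport.pid`` with ``PropertyMock``. A minimal, standalone illustration of that pattern: attaching the mock to the type makes plain attribute reads return the stubbed value.

from unittest.mock import MagicMock, PropertyMock

transport = MagicMock()
type(transport).pid = PropertyMock(return_value=12345)

assert transport.pid == 12345  # read like a normal attribute, no call required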


@@ -290,8 +291,8 @@ def test_status(txrequest, root, scrapy_process, args):
root.update_projects()

if args:
root.launcher.finished.add(Job(project="p2", spider="s2", job="j1"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j1", {}, [])
root.launcher.finished.add(get_finished_job("p2", "s2", "j1"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j1", env={}, args=[])
root.poller.queues["p2"].add("s2", _job="j1")

expected = {"currstate": None}
@@ -325,8 +326,8 @@ def test_list_jobs(txrequest, root, scrapy_process, args):
root.update_projects()

if args:
root.launcher.finished.add(Job(project="p2", spider="s2", job="j2"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2", {}, [])
root.launcher.finished.add(get_finished_job("p2", "s2", "j2"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j2", env={}, args=[])
root.poller.queues["p2"].add("s2", _job="j2")

expected = {"pending": [], "running": [], "finished": []}
@@ -336,9 +337,9 @@

expected["finished"].append(
{
"id": "j1",
"project": "p1",
"spider": "s1",
"id": "j1",
"start_time": "2001-02-03 04:05:06.000007",
"end_time": "2001-02-03 04:05:06.000008",
"items_url": "/items/p1/s1/j1.jl",
@@ -351,11 +352,11 @@

expected["running"].append(
{
"id": "j1",
"project": "p1",
"spider": "s1",
"start_time": "2001-02-03 04:05:06.000009",
"id": "j1",
"pid": None,
"start_time": "2001-02-03 04:05:06.000009",
}
)
assert_content(txrequest, root, "GET", "listjobs", args, expected)
@@ -371,9 +372,9 @@

expected["pending"].append(
{
"id": "j1",
"project": "p1",
"spider": "s1",
"id": "j1",
"version": "0.1",
"settings": {"DOWNLOAD_DELAY=2": "TRACK=Cause = Time"},
"args": {"other": "one"},
@@ -645,7 +646,7 @@ def test_cancel(txrequest, root, scrapy_process, args):

root.launcher.processes[0] = scrapy_process
root.launcher.processes[1] = scrapy_process
root.launcher.processes[2] = ScrapyProcessProtocol("p2", "s2", "j2", {}, [])
root.launcher.processes[2] = ScrapyProcessProtocol("p2", "s2", "j2", env={}, args=[])

expected["prevstate"] = "running"
assert_content(txrequest, root, "POST", "cancel", args, expected)
16 changes: 8 additions & 8 deletions tests/test_website.py
@@ -7,10 +7,9 @@
from twisted.web.test.requesthelper import DummyRequest

from scrapyd.app import application
from scrapyd.jobstorage import Job
from scrapyd.launcher import ScrapyProcessProtocol
from scrapyd.website import Root
from tests import has_settings, root_add_version
from tests import get_finished_job, has_settings, root_add_version


def assert_headers(txrequest):
@@ -33,7 +32,7 @@ def assert_hrefs(urls, text, header):

# Derived from test_emptyChildUnicodeParent.
# https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py
def test_render_logs_dir(txrequest, root):
def test_logs_dir(txrequest, root):
os.makedirs(os.path.join("logs", "quotesbot"))

file = root.children[b"logs"]
@@ -49,7 +48,7 @@ def test_render_logs_dir(txrequest, root):

# Derived from test_indexNames.
# https://github.com/twisted/twisted/blob/trunk/src/twisted/web/test/test_static.py
def test_render_logs_file(txrequest, root):
def test_logs_file(txrequest, root):
os.makedirs(os.path.join("logs", "quotesbot"))
with open(os.path.join("logs", "foo.txt"), "wb") as f:
f.write(b"baz")
@@ -74,16 +73,16 @@ def cbRendered(ignored):

@pytest.mark.parametrize("cancel", [True, False], ids=["cancel", "no_cancel"])
@pytest.mark.parametrize("header", [True, False], ids=["header", "no_header"])
def test_render_jobs(txrequest, config, cancel, header):
def test_jobs(txrequest, config, cancel, header):
if not cancel:
config.cp.remove_option("services", "cancel.json")

root = Root(config, application(config))
root_add_version(root, "quotesbot", "0.1", "quotesbot")
root.update_projects()

root.launcher.finished.add(Job("p1", "s1", "j-finished"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j-running", {}, [])
root.launcher.finished.add(get_finished_job("p1", "s1", "j-finished"))
root.launcher.processes[0] = ScrapyProcessProtocol("p2", "s2", "j-running", env={}, args=[])
root.poller.queues["quotesbot"].add("quotesbot", _job="j-pending")

if header:
@@ -117,11 +116,12 @@ def test_render_jobs(txrequest, config, cancel, header):
else:
assert b"<th>Cancel</th>" not in content
assert b'/cancel.json">' not in content
assert b' value="j-finished">' not in content


@pytest.mark.parametrize("with_egg", [True, False])
@pytest.mark.parametrize("header", [True, False])
def test_render_home(txrequest, root, with_egg, header):
def test_home(txrequest, root, with_egg, header):
if with_egg:
root_add_version(root, "quotesbot", "0.1", "quotesbot")
root.update_projects()
