Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add black & isort pre-commit hooks #41

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@
]
}
},
"postCreateCommand": "poetry install"
"postCreateCommand": "poetry install && poetry run pre-commit install"
}
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Pre-commit hook configuration (https://pre-commit.com).
# Hooks are declared as `repo: local` and invoked through Poetry so they
# run the project's own pinned versions of black and isort rather than
# versions pinned by pre-commit itself.
repos:
  - repo: local
    hooks:
      # black: auto-formats staged Python files on commit.
      - id: black
        name: black
        entry: poetry run black
        language: system   # use the local (Poetry) environment, no isolated env
        types: [python]    # only run against Python files

      # isort: keeps import statements sorted and grouped consistently.
      - id: isort
        name: isort
        entry: poetry run isort
        language: system
        types: [python]
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,16 @@ pip install poetry

# Install dependencies
poetry install

# Set up pre-commit hooks to keep your code formatted
poetry run pre-commit install
```

Check out [Important commands](#important-commands) below for next steps.

### Developing using a GitHub Codespace

This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, and Python dependencies for you.
This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, Python dependencies, and pre-commit hooks for you.

To create a new Codespace:
1. Click on the `<> Code` dropdown on the GitHub UI.
Expand Down
24 changes: 14 additions & 10 deletions edgar_tool/cli.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import sys
import time
from datetime import date, timedelta, datetime
from datetime import date, datetime, timedelta
from typing import List, Optional
from warnings import warn

from edgar_tool.constants import (
SUPPORTED_OUTPUT_EXTENSIONS,
TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING,
)
from edgar_tool.page_fetcher import NoResultsFoundError
from edgar_tool.rss import fetch_rss_feed
from edgar_tool.text_search import EdgarTextSearcher
from edgar_tool.utils import parse_location_input
from edgar_tool.page_fetcher import NoResultsFoundError
GalenReich marked this conversation as resolved.
Show resolved Hide resolved


def _validate_text_search_args(
Expand Down Expand Up @@ -57,15 +58,16 @@ def _validate_text_search_args(
):
raise ValueError(
f"Filing form group must be one of: {'; '.join(TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.keys())}"
)
)
if single_forms:
single_list = [item for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values() for item in
sublist]
single_list = [
item
for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values()
for item in sublist
]
invalid_forms = [form for form in single_forms if form not in single_list]
if invalid_forms:
raise ValueError(
f"Single forms must be one or more of: {single_list}"
)
raise ValueError(f"Single forms must be one or more of: {single_list}")


class SecEdgarScraperCli:
Expand Down Expand Up @@ -135,7 +137,9 @@ def text_search(
scraper.text_search(
keywords=keywords,
entity_id=entity_id,
filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(filing_form),
filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(
filing_form
),
single_forms=single_forms,
start_date=start_date,
end_date=end_date,
Expand All @@ -144,7 +148,7 @@ def text_search(
retries=retries,
destination=output,
peo_in=peo_in,
inc_in=inc_in
inc_in=inc_in,
)

@staticmethod
Expand Down
6 changes: 2 additions & 4 deletions edgar_tool/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import csv
import json
from typing import List, Dict, Any, Iterator
from typing import Any, Dict, Iterator, List

import jsonlines

from edgar_tool.constants import (
SUPPORTED_OUTPUT_EXTENSIONS,
)
from edgar_tool.constants import SUPPORTED_OUTPUT_EXTENSIONS


def write_results_to_file(
Expand Down
3 changes: 2 additions & 1 deletion edgar_tool/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from edgar_tool.cli import SecEdgarScraperCli
import fire

from edgar_tool.cli import SecEdgarScraperCli


def main_entrypoint():
fire.Fire(SecEdgarScraperCli)
Expand Down
8 changes: 5 additions & 3 deletions edgar_tool/page_fetcher.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import time
import uuid
from random import uniform
from typing import Callable, Any, Optional
from typing import Any, Callable, Optional

from tenacity import retry, wait_fixed, stop_after_attempt
import requests
from tenacity import retry, stop_after_attempt, wait_fixed


def fetch_page(
Expand All @@ -23,6 +23,7 @@ def fetch_page(
:param stop_after_n: how many times to retry the request before failing
:return: wrapper function that takes a check method and retries the request if the page load fails
"""

@retry(
wait=wait_fixed(uniform(min_wait_seconds, max_wait_seconds)),
stop=stop_after_attempt(stop_after_n),
Expand Down Expand Up @@ -57,5 +58,6 @@ class ResultsTableNotFoundError(Exception):
class PageCheckFailedError(Exception):
    # Raised when a fetched page fails the caller-supplied validity check
    # (see fetch_page's retry wrapper, which retries when the page load fails).
    pass


class NoResultsFoundError(Exception):
    # Raised when a search query yields no results (e.g. text_search raises
    # this after all search requests come back empty).
    pass
pass
2 changes: 1 addition & 1 deletion edgar_tool/rss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import uuid
from pathlib import Path
from typing import List, Any, Dict, Iterator, Tuple
from typing import Any, Dict, Iterator, List, Tuple

import requests
import xmltodict
Expand Down
45 changes: 28 additions & 17 deletions edgar_tool/text_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,23 @@
import urllib.parse
from datetime import date, timedelta
from math import ceil
from typing import List, Optional, Dict, Any, Iterator
from typing import Any, Dict, Iterator, List, Optional


from edgar_tool.page_fetcher import (
fetch_page,
PageCheckFailedError,
ResultsTableNotFoundError,
NoResultsFoundError
)
from edgar_tool.constants import (
TEXT_SEARCH_BASE_URL,
TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
TEXT_SEARCH_CSV_FIELDS_NAMES,
TEXT_SEARCH_FORM_MAPPING,
TEXT_SEARCH_LOCATIONS_MAPPING,
TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
)
from edgar_tool.io import write_results_to_file
from edgar_tool.page_fetcher import (
NoResultsFoundError,
PageCheckFailedError,
ResultsTableNotFoundError,
fetch_page,
)
from edgar_tool.utils import split_date_range_in_n, unpack_singleton_list


Expand Down Expand Up @@ -130,7 +129,11 @@ def _parse_row(row: Dict[str, Any]) -> Dict[str, Any]:

places_of_business = _source.get("biz_locations")
places_of_business = [
f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}" if len(split) == 2 else f"{split[0]}"
(
f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}"
if len(split) == 2
else f"{split[0]}"
)
for place in places_of_business
if (split := place.rsplit(", ", maxsplit=1))
]
Expand Down Expand Up @@ -226,25 +229,31 @@ def _generate_request_args(

# Add optional parameters
if peo_in and inc_in:
raise ValueError("use only one of peo_in or inc_in, not both") ## because SEC API doesn't support
raise ValueError(
"use only one of peo_in or inc_in, not both"
) ## because SEC API doesn't support
else:
if peo_in:
request_args["locationCodes"] = peo_in
if inc_in:
request_args["locationCodes"] = inc_in
request_args["locationType"] = "incorporated"

if entity_id:
request_args["entityName"] = entity_id
# Handle forms and single forms
part_filing_form = [] if filing_form is None else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
part_filing_form = (
[]
if filing_form is None
else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
)
part_single_forms = [] if single_forms is None else single_forms

# Join the filing_forms and single forms and remove duplicates
forms = ",".join(list(set(part_filing_form + part_single_forms)))
if forms != "":
request_args["forms"] = forms

# URL-encode the request arguments
request_args = urllib.parse.urlencode(request_args)

Expand Down Expand Up @@ -373,7 +382,9 @@ def _generate_search_requests(
# If we have 10000 results, split date range in two separate requests and fetch first page again, do so until
# we have a set of date ranges for which none of the requests have 10000 results
if num_results == 0:
print(f"No results found for query in date range {start_date} -> {end_date}.")
print(
f"No results found for query in date range {start_date} -> {end_date}."
)
elif num_results < 10000:
print(
f"Less than 10000 ({num_results}) results found for range {start_date} -> {end_date}, "
Expand Down Expand Up @@ -475,7 +486,7 @@ def text_search(
print(
f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}"
)
if(search_requests_results == []):
if search_requests_results == []:
raise NoResultsFoundError(f"No results found for the search query")
write_results_to_file(
itertools.chain(*search_requests_results),
Expand Down Expand Up @@ -518,4 +529,4 @@ def _fetch_first_page_results_number(
raise NoResultsFoundError(
f"\nExecution aborting due to a {e.__class__.__name__} error raised "
f"while parsing number of results for first page at URL {url}: {e}"
) from e
) from e
Loading
Loading