Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add black & isort pre-commit hooks #41

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@
]
}
},
"postCreateCommand": "poetry install"
"postCreateCommand": "poetry install && poetry run pre-commit install"
}
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Pre-commit hook configuration (https://pre-commit.com).
# Hooks are declared as `repo: local` and invoked through Poetry so they
# run the project's own pinned versions of black and isort rather than
# versions pinned by pre-commit itself.
repos:
  - repo: local
    hooks:
      # black: auto-formats staged Python files on commit.
      - id: black
        name: black
        entry: poetry run black
        language: system   # use the local (Poetry) environment, no isolated env
        types: [python]    # only run against Python files

      # isort: keeps import statements sorted and grouped consistently.
      - id: isort
        name: isort
        entry: poetry run isort
        language: system
        types: [python]
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,16 @@ pip install poetry

# Install dependencies
poetry install

# Set up pre-commit hooks to keep your code formatted
poetry run pre-commit install
```

Check out [Important commands](#important-commands) below for next steps.

### Developing using a GitHub Codespace

This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, and Python dependencies for you.
This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, Python dependencies, and pre-commit hooks for you.

To create a new Codespace:
1. Click on the `<> Code` dropdown on the GitHub UI.
Expand Down
24 changes: 14 additions & 10 deletions edgar_tool/cli.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import sys
import time
from datetime import date, timedelta, datetime
from datetime import date, datetime, timedelta
from typing import List, Optional
from warnings import warn

from edgar_tool.constants import (
SUPPORTED_OUTPUT_EXTENSIONS,
TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING,
)
from edgar_tool.page_fetcher import NoResultsFoundError
from edgar_tool.rss import fetch_rss_feed
from edgar_tool.text_search import EdgarTextSearcher
from edgar_tool.utils import parse_location_input
from edgar_tool.page_fetcher import NoResultsFoundError
GalenReich marked this conversation as resolved.
Show resolved Hide resolved


def _validate_text_search_args(
Expand Down Expand Up @@ -57,15 +58,16 @@ def _validate_text_search_args(
):
raise ValueError(
f"Filing form group must be one of: {'; '.join(TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.keys())}"
)
)
if single_forms:
single_list = [item for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values() for item in
sublist]
single_list = [
item
for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values()
for item in sublist
]
invalid_forms = [form for form in single_forms if form not in single_list]
if invalid_forms:
raise ValueError(
f"Single forms must be one or more of: {single_list}"
)
raise ValueError(f"Single forms must be one or more of: {single_list}")


class SecEdgarScraperCli:
Expand Down Expand Up @@ -135,7 +137,9 @@ def text_search(
scraper.text_search(
keywords=keywords,
entity_id=entity_id,
filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(filing_form),
filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(
filing_form
),
single_forms=single_forms,
start_date=start_date,
end_date=end_date,
Expand All @@ -144,7 +148,7 @@ def text_search(
retries=retries,
destination=output,
peo_in=peo_in,
inc_in=inc_in
inc_in=inc_in,
)

@staticmethod
Expand Down
6 changes: 2 additions & 4 deletions edgar_tool/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import csv
import json
from typing import List, Dict, Any, Iterator
from typing import Any, Dict, Iterator, List

import jsonlines

from edgar_tool.constants import (
SUPPORTED_OUTPUT_EXTENSIONS,
)
from edgar_tool.constants import SUPPORTED_OUTPUT_EXTENSIONS


def write_results_to_file(
Expand Down
3 changes: 2 additions & 1 deletion edgar_tool/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from edgar_tool.cli import SecEdgarScraperCli
import fire

from edgar_tool.cli import SecEdgarScraperCli


def main_entrypoint():
fire.Fire(SecEdgarScraperCli)
Expand Down
8 changes: 5 additions & 3 deletions edgar_tool/page_fetcher.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import time
import uuid
from random import uniform
from typing import Callable, Any, Optional
from typing import Any, Callable, Optional

from tenacity import retry, wait_fixed, stop_after_attempt
import requests
from tenacity import retry, stop_after_attempt, wait_fixed


def fetch_page(
Expand All @@ -23,6 +23,7 @@ def fetch_page(
:param stop_after_n: how many times to retry the request before failing
:return: wrapper function that takes a check method and retries the request if the page load fails
"""

@retry(
wait=wait_fixed(uniform(min_wait_seconds, max_wait_seconds)),
stop=stop_after_attempt(stop_after_n),
Expand Down Expand Up @@ -57,5 +58,6 @@ class ResultsTableNotFoundError(Exception):
class PageCheckFailedError(Exception):
    # Raised when a fetched page fails the caller-supplied validity check
    # (see fetch_page's retry wrapper, which retries when the page load fails).
    pass


class NoResultsFoundError(Exception):
    # Raised when a search query yields no results (e.g. text_search raises
    # this after all search requests come back empty).
    pass
pass
2 changes: 1 addition & 1 deletion edgar_tool/rss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import uuid
from pathlib import Path
from typing import List, Any, Dict, Iterator, Tuple
from typing import Any, Dict, Iterator, List, Tuple

import requests
import xmltodict
Expand Down
45 changes: 28 additions & 17 deletions edgar_tool/text_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,23 @@
import urllib.parse
from datetime import date, timedelta
from math import ceil
from typing import List, Optional, Dict, Any, Iterator
from typing import Any, Dict, Iterator, List, Optional


from edgar_tool.page_fetcher import (
fetch_page,
PageCheckFailedError,
ResultsTableNotFoundError,
NoResultsFoundError
)
from edgar_tool.constants import (
TEXT_SEARCH_BASE_URL,
TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
TEXT_SEARCH_CSV_FIELDS_NAMES,
TEXT_SEARCH_FORM_MAPPING,
TEXT_SEARCH_LOCATIONS_MAPPING,
TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
)
from edgar_tool.io import write_results_to_file
from edgar_tool.page_fetcher import (
NoResultsFoundError,
PageCheckFailedError,
ResultsTableNotFoundError,
fetch_page,
)
from edgar_tool.utils import split_date_range_in_n, unpack_singleton_list


Expand Down Expand Up @@ -130,7 +129,11 @@ def _parse_row(row: Dict[str, Any]) -> Dict[str, Any]:

places_of_business = _source.get("biz_locations")
places_of_business = [
f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}" if len(split) == 2 else f"{split[0]}"
(
f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}"
if len(split) == 2
else f"{split[0]}"
)
for place in places_of_business
if (split := place.rsplit(", ", maxsplit=1))
]
Expand Down Expand Up @@ -226,25 +229,31 @@ def _generate_request_args(

# Add optional parameters
if peo_in and inc_in:
raise ValueError("use only one of peo_in or inc_in, not both") ## because SEC API doesn't support
raise ValueError(
"use only one of peo_in or inc_in, not both"
) ## because SEC API doesn't support
else:
if peo_in:
request_args["locationCodes"] = peo_in
if inc_in:
request_args["locationCodes"] = inc_in
request_args["locationType"] = "incorporated"

if entity_id:
request_args["entityName"] = entity_id
# Handle forms and single forms
part_filing_form = [] if filing_form is None else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
part_filing_form = (
[]
if filing_form is None
else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
)
part_single_forms = [] if single_forms is None else single_forms

# Join the filing_forms and single forms and remove duplicates
forms = ",".join(list(set(part_filing_form + part_single_forms)))
if forms != "":
request_args["forms"] = forms

# URL-encode the request arguments
request_args = urllib.parse.urlencode(request_args)

Expand Down Expand Up @@ -373,7 +382,9 @@ def _generate_search_requests(
# If we have 10000 results, split date range in two separate requests and fetch first page again, do so until
# we have a set of date ranges for which none of the requests have 10000 results
if num_results == 0:
print(f"No results found for query in date range {start_date} -> {end_date}.")
print(
f"No results found for query in date range {start_date} -> {end_date}."
)
elif num_results < 10000:
print(
f"Less than 10000 ({num_results}) results found for range {start_date} -> {end_date}, "
Expand Down Expand Up @@ -475,7 +486,7 @@ def text_search(
print(
f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}"
)
if(search_requests_results == []):
if search_requests_results == []:
raise NoResultsFoundError(f"No results found for the search query")
write_results_to_file(
itertools.chain(*search_requests_results),
Expand Down Expand Up @@ -518,4 +529,4 @@ def _fetch_first_page_results_number(
raise NoResultsFoundError(
f"\nExecution aborting due to a {e.__class__.__name__} error raised "
f"while parsing number of results for first page at URL {url}: {e}"
) from e
) from e
Loading
Loading