-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pandas performance & automated release #144
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
--- | ||
name: build-package-release | ||
|
||
on: push | ||
|
||
jobs: | ||
build: | ||
name: Build Python Distribution | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout source | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 2 | ||
- name: Set up Python | ||
uses: actions/setup-python@v4 | ||
with: | ||
python-version: "3.11" | ||
- name: Build source and wheel distributions | ||
run: | | ||
python -m pip install --upgrade build twine | ||
python -m build | ||
twine check --strict dist/* | ||
- name: Store the distribution packages | ||
uses: actions/upload-artifact@v3 | ||
with: | ||
name: python-package-distributions | ||
path: dist/ | ||
|
||
pypi-publish: | ||
name: Upload release to PyPI | ||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') | ||
needs: | ||
- build | ||
runs-on: ubuntu-latest | ||
environment: | ||
name: pypi | ||
url: https://pypi.org/p/catalystcoop.ferc-xbrl-extractor | ||
permissions: | ||
id-token: write | ||
steps: | ||
- name: Download all the dists | ||
uses: actions/download-artifact@v3 | ||
with: | ||
name: python-package-distributions | ||
path: dist/ | ||
- name: Publish distribution to PyPI | ||
uses: pypa/gh-action-pypi-publish@release/v1 | ||
|
||
github-publish: | ||
name: Release package on GitHub | ||
needs: | ||
- pypi-publish | ||
runs-on: ubuntu-latest | ||
permissions: | ||
contents: write | ||
id-token: write | ||
steps: | ||
- name: Download all the dists | ||
uses: actions/download-artifact@v3 | ||
with: | ||
name: python-package-distributions | ||
path: dist/ | ||
- name: Sign dist with Sigstore | ||
uses: sigstore/[email protected] | ||
with: | ||
inputs: ./dist/*.tar.gz ./dist/*.whl | ||
- name: Upload artifact signatures to GitHub Release | ||
env: | ||
GITHUB_TOKEN: ${{ github.token }} | ||
run: >- | ||
gh release create '${{ github.ref_name }}' ./dist/*.tar.gz ./dist/*.whl | ||
--generate-notes | ||
--repo '${{ github.repository }}' | ||
|
||
release-notify: | ||
runs-on: ubuntu-latest | ||
needs: | ||
- github-publish | ||
if: ${{ always() }} | ||
steps: | ||
- name: Inform the Codemonkeys | ||
uses: 8398a7/action-slack@v3 | ||
continue-on-error: true | ||
with: | ||
status: custom | ||
fields: workflow,job,commit,repo,ref,author,took | ||
custom_payload: | | ||
{ | ||
username: 'action-slack', | ||
icon_emoji: ':octocat:', | ||
attachments: [{ | ||
color: '${{ needs.github-publish.result }}' === 'success' ? 'good' : '${{ needs.github-publish.result }}' === 'failure' ? 'danger' : 'warning', | ||
text: `${process.env.AS_REPO}@${process.env.AS_REF}\n ${process.env.AS_WORKFLOW} (${process.env.AS_COMMIT})\n by ${process.env.AS_AUTHOR}\n Status: ${{ needs.github-publish.result }}`, | ||
}] | ||
} | ||
env: | ||
GITHUB_TOKEN: ${{ github.token }} # required | ||
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required | ||
MATRIX_CONTEXT: ${{ toJson(matrix) }} # required |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,9 +84,11 @@ def _dedupe_newer_report_wins(df: pd.DataFrame, primary_key: list[str]) -> pd.Da | |
old_index = df.index.names | ||
return ( | ||
df.reset_index() | ||
.sort_values("report_date") | ||
.groupby(unique_cols) | ||
.apply(lambda df: df.sort_values("report_date").ffill().iloc[-1]) | ||
.last() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perfect! That's the exact intent & also this conveys what's going on way better! |
||
.drop("report_date", axis="columns") | ||
.reset_index() | ||
.set_index(old_index) | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ def test_extractor_scripts(script_runner, ep): | |
|
||
The script_runner fixture is provided by the pytest-console-scripts plugin. | ||
""" | ||
ret = script_runner.run(ep, "--help", print_result=False) | ||
ret = script_runner.run([ep, "--help"], print_result=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a switch from deprecated syntax. |
||
assert ret.success # nosec: B101 | ||
|
||
|
||
|
@@ -38,19 +38,21 @@ def test_extract_example_filings(script_runner, tmp_path, test_dir): | |
data_dir = test_dir / "integration" / "data" | ||
|
||
ret = script_runner.run( | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--taxonomy", | ||
str(data_dir / "taxonomy.zip"), | ||
"--entry-point", | ||
"taxonomy/form1/2021-01-01/form/form1/form-1_2021-01-01.xsd", | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
[ | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--taxonomy", | ||
str(data_dir / "taxonomy.zip"), | ||
"--entry-point", | ||
"taxonomy/form1/2021-01-01/form/form1/form-1_2021-01-01.xsd", | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
] | ||
) | ||
|
||
assert ret.success | ||
|
@@ -72,15 +74,17 @@ def test_extract_example_filings_no_explicit_taxonomy( | |
data_dir = test_dir / "integration" / "data" | ||
|
||
ret = script_runner.run( | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
[ | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
] | ||
) | ||
|
||
assert ret.success | ||
|
@@ -100,17 +104,19 @@ def test_extract_example_filings_bad_form(script_runner, tmp_path, test_dir): | |
data_dir = test_dir / "integration" / "data" | ||
|
||
ret = script_runner.run( | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--form-number", | ||
"666", | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
[ | ||
"xbrl_extract", | ||
str(data_dir / "ferc1-xbrl-2021.zip"), | ||
str(out_db), | ||
"--form-number", | ||
"666", | ||
"--metadata-path", | ||
str(metadata), | ||
"--datapackage-path", | ||
str(datapackage), | ||
"--logfile", | ||
str(log_file), | ||
] | ||
) | ||
|
||
assert not ret.success |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,6 @@ allowlist_externals = | |
bash | ||
coverage | ||
sphinx-build | ||
twine | ||
# shared directory for re-used packages | ||
envdir = {toxinidir}/.env_tox | ||
passenv = | ||
|
@@ -109,37 +108,3 @@ commands = | |
{[testenv:integration]commands} | ||
{[testenv]covreport} | ||
|
||
####################################################################################### | ||
# Software Package Build & Release | ||
####################################################################################### | ||
[testenv:build] | ||
description = Prepare Python source and binary packages for release. | ||
basepython = python3 | ||
skip_install = false | ||
extras = | ||
dev | ||
commands = | ||
bash -c 'rm -rf build/* dist/*' | ||
python -m build | ||
|
||
[testenv:testrelease] | ||
description = Do a dry run of Python package release using the PyPI test server. | ||
basepython = python3 | ||
skip_install = false | ||
extras = | ||
dev | ||
commands = | ||
{[testenv:build]commands} | ||
twine check dist/* | ||
twine upload --verbose --repository testpypi --skip-existing dist/* | ||
|
||
[testenv:release] | ||
description = Release the package to the production PyPI server. | ||
basepython = python3 | ||
skip_install = false | ||
extras = | ||
dev | ||
commands = | ||
{[testenv:build]commands} | ||
twine check dist/* | ||
twine upload --verbose --skip-existing dist/* | ||
Comment on lines
-112
to
-145
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed because it takes place in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wooohoo! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can `sort_values()` before doing the groupby because, according to the groupby docs, "Groupby preserves the order of rows within each group."

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One potentially non-deterministic outcome here is that if there are duplicate values of `report_date` within a group, then which row of data ends up being "last" might not always be the same. Do we think there can ever be multiple filings within a group that have the same date?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes!
For some reason, I made this issue in the PUDL repo instead: catalyst-cooperative/pudl#2822, but the short of it is "we need to use metadata from `rssfeed` to sort them better."