From a26168d91cf82655ff6bd1b453165cb567c4fce6 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Apr 2024 22:34:26 -0700 Subject: [PATCH 1/5] feat: Split markdown files when larger than max issue body size Signed-off-by: Zack Koppert --- .gitignore | 2 +- README.md | 1 + docs/dealing-with-large-issue-metrics.md | 16 +++++ issue_metrics.py | 15 +++++ markdown_helpers.py | 38 ++++++++++++ test_markdown_helpers.py | 75 ++++++++++++++++++++++++ 6 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 docs/dealing-with-large-issue-metrics.md create mode 100644 markdown_helpers.py create mode 100644 test_markdown_helpers.py diff --git a/.gitignore b/.gitignore index 5aab725..4f454b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Output files -issue_metrics.md +issue_metrics*.md issue_metrics.json # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index 801ea85..c6b7ea9 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ This action can be configured to authenticate with GitHub App Installation or Pe - [Configuring the `SEARCH_QUERY`](./docs/search-query.md) - [Local usage without Docker](./docs/local-usage-without-docker.md) - [Authenticating with GitHub App Installation](./docs/authenticating-with-github-app-installation.md) +- [Dealing with large issue_metrics.md files](./docs/dealing-with-large-issue-metrics.md) ## Contributions diff --git a/docs/dealing-with-large-issue-metrics.md b/docs/dealing-with-large-issue-metrics.md new file mode 100644 index 0000000..1a85e6c --- /dev/null +++ b/docs/dealing-with-large-issue-metrics.md @@ -0,0 +1,16 @@ +# Dealing with large issue metrics markdown files + +When working with lots of issues/pull requests/discussion results, the resulting issue_metrics.md file can become very large. This can cause the GitHub API to return an error when trying to create an issue with the contents of the file. + +```shell +Pull request creation failed. 
Validation failed: Body is too long (maximum is 65536 characters) +``` + +To work around this limitation, the issue-metrics action detects the issue and splits the issue_metrics.md file into smaller files. So instead of issue_metrics.md, you will get issue_metrics_0.md, issue_metrics_1.md, etc. Since we don't want the action to fail, it has been designed to have the same name as usual for the first split file (issue_metrics.md) and then append a number to the name for the subsequent split files. + +You can choose one of the following strategies to deal with the split files: +- Create multiple issues, each with using the next split file in the sequence. +- Upload the full file as an artifact and link to it in the issue body. +- Create an issue and put the content of the split files as issue comments. + +JSON output files are not split since it's not anticipated that you will use them as issue body content. diff --git a/issue_metrics.py b/issue_metrics.py index ef79e51..ec85210 100644 --- a/issue_metrics.py +++ b/issue_metrics.py @@ -20,6 +20,7 @@ main(): Run the issue-metrics script. 
""" +import shutil import sys from typing import List, Union @@ -30,6 +31,7 @@ from discussions import get_discussions from json_writer import write_to_json from labels import get_label_metrics, get_stats_time_in_labels +from markdown_helpers import markdown_too_large_for_issue_body, split_markdown_file from markdown_writer import write_to_markdown from most_active_mentors import count_comments_per_user, get_mentor_count from time_to_answer import get_stats_time_to_answer, measure_time_to_answer @@ -364,6 +366,7 @@ def main(): num_mentor_count, search_query, ) + write_to_markdown( issues_with_metrics, stats_time_to_first_response, @@ -377,6 +380,18 @@ def main(): search_query, ) + max_char_count = 65535 + if markdown_too_large_for_issue_body("issue_metrics.md", max_char_count): + split_markdown_file("issue_metrics.md", max_char_count) + shutil.move("issue_metrics.md", "issue_metrics_full.md") + shutil.move("issue_metrics_0.md", "issue_metrics.md") + print( + "Issue metrics markdown file is too large for GitHub issue body and has been \ + split into multiple files. ie. issue_metrics.md, issue_metrics_1.md, etc. \ + The full file is saved as issue_metrics_full.md\n\ + See https://github.com/github/issue-metrics/blob/main/docs/dealing-with-large-issue-metrics.md" + ) + if __name__ == "__main__": main() diff --git a/markdown_helpers.py b/markdown_helpers.py new file mode 100644 index 0000000..8faf39a --- /dev/null +++ b/markdown_helpers.py @@ -0,0 +1,38 @@ +""" Helper functions for working with markdown files. """ + + +def markdown_too_large_for_issue_body(file_path: str, max_char_count: int) -> bool: + """ + Check if the markdown file is too large to fit into a github issue. 
+ + Inputs: + file_path: str - the path to the markdown file to check + max_char_count: int - the maximum number of characters allowed in a github issue body + + Returns: + bool - True if the file is too large, False otherwise + + """ + with open(file_path, "r", encoding="utf-8") as file: + file_contents = file.read() + return len(file_contents) > max_char_count + + +def split_markdown_file(file_path: str, max_char_count: int) -> None: + """ + Split the markdown file into smaller files. + + Inputs: + file_path: str - the path to the markdown file to split + max_char_count: int - the maximum number of characters allowed before splitting markdown file + + """ + with open(file_path, "r", encoding="utf-8") as file: + file_contents = file.read() + contents_list = [ + file_contents[i : i + max_char_count] + for i in range(0, len(file_contents), max_char_count) + ] + for i, content in enumerate(contents_list): + with open(f"{file_path[:-3]}_{i}.md", "w", encoding="utf-8") as new_file: + new_file.write(content) diff --git a/test_markdown_helpers.py b/test_markdown_helpers.py new file mode 100644 index 0000000..b09c739 --- /dev/null +++ b/test_markdown_helpers.py @@ -0,0 +1,75 @@ +""" Unit tests for the markdown_helpers module. """ + +import os +import unittest + +from markdown_helpers import markdown_too_large_for_issue_body, split_markdown_file + + +class TestMarkdownHelpers(unittest.TestCase): + """ + Unit tests for the markdown_helpers module. + """ + + def test_markdown_too_large_for_issue_body(self): + """ + Test the markdown_too_large_for_issue_body function. 
+ """ + # Define a sample markdown file content + max_char_count = 65535 + markdown_content = "a\n" * max_char_count + + # Write the markdown content to a temporary file + with open("temp.md", "w", encoding="utf-8") as f: + f.write(markdown_content) + + # Call the function with the temporary file + result = markdown_too_large_for_issue_body("temp.md", max_char_count) + + # remove the temporary file + os.remove("temp.md") + + # Assert that the function returns True + self.assertTrue(result) + + def test_split_markdown_file(self): + """ + Test the split_markdown_file function. + """ + + # Define a sample markdown file content with 3 times the maximum character count + multiple_of_max = 4 + max_char_count = 65535 + repeated_content = "a\n" + markdown_content = repeated_content * int( + (max_char_count * multiple_of_max) / len(repeated_content) + ) + + # Write the markdown content to a temporary file + with open("temp.md", "w", encoding="utf-8") as f: + f.write(markdown_content) + + # Call the function with the temporary file + split_markdown_file("temp.md", max_char_count) + + # Assert that the function creates four files + self.assertTrue(os.path.exists("temp_0.md")) + self.assertTrue(os.path.exists("temp_1.md")) + self.assertTrue(os.path.exists("temp_2.md")) + self.assertTrue(os.path.exists("temp_3.md")) + + # Assert that all files have at most max characters + for i in range(0, multiple_of_max): + with open(f"temp_{i}.md", "r", encoding="utf-8") as f: + self.assertLessEqual(len(f.read()), max_char_count) + + # remove the temporary files + os.remove("temp.md") + os.remove("temp_0.md") + os.remove("temp_1.md") + os.remove("temp_2.md") + os.remove("temp_3.md") + + +if __name__ == "__main__": + unittest.main() From c9c7a478d063a455511c5cbd313debd127c465d6 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Tue, 30 Apr 2024 22:46:08 -0700 Subject: [PATCH 2/5] fix: formatting errors Signed-off-by: Zack Koppert --- .github/linters/.flake8 | 2 +- 
docs/dealing-with-large-issue-metrics.md | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/linters/.flake8 b/.github/linters/.flake8 index 1aaf829..de0917a 100644 --- a/.github/linters/.flake8 +++ b/.github/linters/.flake8 @@ -1,5 +1,5 @@ [flake8] exclude = venv,.venv,.git,__pycache__ -extend-ignore = C901 +extend-ignore = C901, E203 max-line-length = 150 statistics = True diff --git a/docs/dealing-with-large-issue-metrics.md b/docs/dealing-with-large-issue-metrics.md index 1a85e6c..d050b51 100644 --- a/docs/dealing-with-large-issue-metrics.md +++ b/docs/dealing-with-large-issue-metrics.md @@ -1,4 +1,4 @@ -# Dealing with large issue metrics markdown files +# Dealing with large issue metrics Markdown files When working with lots of issues/pull requests/discussion results, the resulting issue_metrics.md file can become very large. This can cause the GitHub API to return an error when trying to create an issue with the contents of the file. @@ -6,7 +6,8 @@ When working with lots of issues/pull requests/discussion results, the resulting Pull request creation failed. Validation failed: Body is too long (maximum is 65536 characters) ``` -To work around this limitation, the issue-metrics action detects the issue and splits the issue_metrics.md file into smaller files. So instead of issue_metrics.md, you will get issue_metrics_0.md, issue_metrics_1.md, etc. Since we don't want the action to fail, it has been designed to have the same name as usual for the first split file (issue_metrics.md) and then append a number to the name for the subsequent split files. +To work around this limitation, the issue-metrics action detects the issue and splits the issue_metrics.md file into smaller files. So instead of issue_metrics.md, you will get issue_metrics_0.md, issue_metrics_1.md, etc. 
+Since we don't want the action to fail, it has been designed to have the same name as usual for the first split file (issue_metrics.md) and then append a number to the name for the subsequent split files. You can choose one of the following strategies to deal with the split files: - Create multiple issues, each with using the next split file in the sequence. From 9e4b1f3f94f2826cc86c7c75f8150c57a457980f Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Wed, 1 May 2024 23:37:26 -0700 Subject: [PATCH 3/5] Update dealing-with-large-issue-metrics.md --- docs/dealing-with-large-issue-metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dealing-with-large-issue-metrics.md b/docs/dealing-with-large-issue-metrics.md index d050b51..5c22451 100644 --- a/docs/dealing-with-large-issue-metrics.md +++ b/docs/dealing-with-large-issue-metrics.md @@ -1,6 +1,6 @@ # Dealing with large issue metrics Markdown files -When working with lots of issues/pull requests/discussion results, the resulting issue_metrics.md file can become very large. This can cause the GitHub API to return an error when trying to create an issue with the contents of the file. +When working with lots of issues/pull requests/discussion results, the resulting issue_metrics.md file can become very large. This can cause the GitHub API to return an error when trying to create an issue with the contents of the file. ```shell Pull request creation failed. 
Validation failed: Body is too long (maximum is 65536 characters) From 5a8575f0829d5ec719280f6c8ce57d122940e205 Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Wed, 1 May 2024 23:37:53 -0700 Subject: [PATCH 4/5] more specific language --- docs/dealing-with-large-issue-metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dealing-with-large-issue-metrics.md b/docs/dealing-with-large-issue-metrics.md index 5c22451..c2247f6 100644 --- a/docs/dealing-with-large-issue-metrics.md +++ b/docs/dealing-with-large-issue-metrics.md @@ -6,7 +6,7 @@ When working with lots of issues/pull requests/discussion results, the resulting Pull request creation failed. Validation failed: Body is too long (maximum is 65536 characters) ``` -To work around this limitation, the issue-metrics action detects the issue and splits the issue_metrics.md file into smaller files. So instead of issue_metrics.md, you will get issue_metrics_0.md, issue_metrics_1.md, etc. +To work around this limitation, the issue-metrics action detects the large file size and splits the issue_metrics.md file into smaller files. So instead of issue_metrics.md, you will get issue_metrics_0.md, issue_metrics_1.md, etc. Since we don't want the action to fail, it has been designed to have the same name as usual for the first split file (issue_metrics.md) and then append a number to the name for the subsequent split files. You can choose one of the following strategies to deal with the split files: From 744f9dc2ca547e2c8c75fea2747810064c26a3bd Mon Sep 17 00:00:00 2001 From: Zack Koppert Date: Wed, 1 May 2024 23:38:29 -0700 Subject: [PATCH 5/5] comment to match code --- test_markdown_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_markdown_helpers.py b/test_markdown_helpers.py index b09c739..d856a62 100644 --- a/test_markdown_helpers.py +++ b/test_markdown_helpers.py @@ -37,7 +37,7 @@ def test_split_markdown_file(self): Test the split_markdown_file function. 
""" - # Define a sample markdown file content with 3 times the maximum character count + # Define a sample markdown file content with 4 times the maximum character count multiple_of_max = 4 max_char_count = 65535 repeated_content = "a\n"