hyperskill · nbirillo · Oct 24, 2023 · Oct 16, 2023 · Oct 16, 2023 · Oct 16, 2023
diff --git a/.flake8 b/.flake8
@@ -51,6 +51,8 @@ ignore =
     WPS347,
     # Forbid starting lines with a dot
     WPS348,
+    # Forbid unpacking iterable objects to lists. Disabled due to false-positives with pandas
+    WPS359,
     # Forbid mutable constants on a module level
     WPS407,
     # Forbid logic inside __init__ module
@@ -65,6 +67,8 @@ ignore =
     WPS432,
     # Forbid overlapping local and block variables. Disabled due to false-positives
     WPS440,
+    # Forbid direct usage of multiline strings
+    WPS462,
     # Forbid comparisons between bitwise and boolean expressions
     WPS465,
     # Forbid @staticmethod decorator

diff --git a/data_labelling/src/hyperstyle/evaluate.py b/data_labelling/src/hyperstyle/evaluate.py
@@ -49,6 +49,7 @@ def main():
     config = HyperstyleEvaluationConfig(tool_path=args.tool_path,
                                         allow_duplicates=args.allow_duplicates,
                                         with_all_categories=args.with_all_categories,
+                                        new_format=False,
                                         tmp_path=args.tmp_directory,
                                         disable=args.disable,
                                         working_directory=args.working_directory,

diff --git a/data_labelling/src/hyperstyle/hyperstyle_evaluation_config.py b/data_labelling/src/hyperstyle/hyperstyle_evaluation_config.py
@@ -1,8 +1,9 @@
+import json
 import logging.config
 import platform
 import sys
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Dict
 
 from hyperstyle.src.python.review.application_config import LanguageVersion
 
@@ -12,17 +13,23 @@
 
 HYPERSTYLE_TOOL_PATH = 'review/hyperstyle/src/python/review/run_tool.py'
 
+# Maps a language name to the settings of its code quality server.
+# Values may be strings (e.g. a host) or integers (e.g. a port parsed with `type=int`).
+IJConfig = Dict[str, Dict[str, Union[str, int]]]
+
 
 class HyperstyleEvaluationConfig(EvaluationConfig):
-    def __init__(self,
-                 tool_path: str,
-                 allow_duplicates: bool,
-                 with_all_categories: bool,
-                 tmp_path: Path,
-                 n_cpu: Optional[int] = None,
-                 disable: Optional[str] = None,
-                 working_directory: Optional[str] = None,
-                 venv: Optional[str] = None):
+    def __init__(
+        self,
+        tool_path: str,
+        allow_duplicates: bool,
+        with_all_categories: bool,
+        new_format: bool,
+        tmp_path: Path,
+        n_cpu: Optional[int] = None,
+        disable: Optional[str] = None,
+        working_directory: Optional[str] = None,
+        venv: Optional[str | Path] = None,
+        ij_config: Optional[IJConfig] = None,
+    ):
         """
         `tool_path` - path to hyperstyle tool running script (custom or HYPERSTYLE_TOOL_PATH)
         `tmp_path` - path where to place evaluation temporary files
@@ -37,10 +44,12 @@ def __init__(self,
 
         self.allow_duplicates: bool = allow_duplicates
         self.with_all_categories: bool = with_all_categories
+        self.new_format: bool = new_format
         self.n_cpu: int = n_cpu
         self.disable: Optional[str] = disable
+        self.ij_config: Optional[IJConfig] = ij_config
         self.working_directory: Optional[str] = working_directory
-        self.venv: Optional[str] = venv
+        self.venv: Optional[str | Path] = venv
 
     def build_command(self,
                       input_path: Union[str, Path],
@@ -73,6 +82,9 @@ def build_command(self,
         if self.disable:
             python_command += ['--disable', self.disable]
 
+        if self.ij_config:
+            python_command += ['--ij-config', json.dumps(self.ij_config)]
+
         if platform.system() == 'Darwin':
             bash_prefix = None
         else:

diff --git a/data_labelling/src/utils/evaluation_utils.py b/data_labelling/src/utils/evaluation_utils.py
@@ -1,9 +1,10 @@
 import logging
+import subprocess
+import time
 from pathlib import Path
 from typing import Callable, List, TypeVar, Optional, Tuple
 
 import pandas as pd
-import time
 
 from core.src.model.column_name import SubmissionColumns
 from core.src.utils.file.file_utils import create_directory, remove_directory, create_file
@@ -70,7 +71,7 @@ def evaluate_command(command: List[str], working_directory: Optional[str] = None
     logger.info('Start evaluation')
     start = time.time()
 
-    logger.info(f'Executing command: {" ".join(command)}')
+    logger.info(f'Executing command: {subprocess.list2cmdline(command)}')
     output, _ = run_in_subprocess(command, working_directory=working_directory)
 
     end = time.time()

diff --git a/jba/src/inspections/__init__.py b/jba/src/inspections/__init__.py
diff --git a/jba/src/inspections/analysis.py b/jba/src/inspections/analysis.py
@@ -0,0 +1,116 @@
+from collections import Counter
+from itertools import islice
+from typing import Set, List, Tuple
+
+import pandas as pd
+
+from core.src.model.column_name import SubmissionColumns
+from core.src.model.quality.issue.hyperstyle_issue import HyperstyleIssue
+from jba.src.models.edu_columns import EduColumnName
+
+
+def find_unique_inspections(group: pd.DataFrame) -> Set[str]:
+    """
+    Collect the codes of all inspections that occur in at least one row of `group`.
+
+    Every cell of the inspections column is expected to be an iterable of objects with a `code` attribute.
+    Rows without inspections contribute nothing to the result.
+    """
+    unique_codes: Set[str] = set()
+
+    # The set is built directly instead of using `Series.explode`: exploding an empty collection
+    # produces NaN, which would end up in the result as a bogus "inspection code".
+    for inspections in group[EduColumnName.INSPECTIONS.value]:
+        unique_codes.update(inspection.code for inspection in inspections)
+
+    return unique_codes
+
+
+def find_fixed_unique_inspections(group: pd.DataFrame) -> Set[str]:
+    """
+    Find inspection codes that occur somewhere in `group` but not in its last attempt.
+
+    The last attempt is the row with the maximum value in the attempt column.
+    """
+    all_codes = find_unique_inspections(group)
+
+    last_attempt_label = group[SubmissionColumns.ATTEMPT.value].idxmax()
+    last_attempt = group.loc[last_attempt_label].squeeze()
+
+    # Everything that is still reported in the last attempt is not considered fixed
+    return all_codes.difference(
+        inspection.code for inspection in last_attempt[EduColumnName.INSPECTIONS.value]
+    )
+
+
+def find_not_fixed_unique_inspections(group: pd.DataFrame) -> Set[str]:
+    """
+    Find inspection codes whose number of occurrences never decreases between consecutive rows of `group`.
+
+    Rows are compared pairwise in the order in which they are stored in `group`.
+    """
+    inspections_column = EduColumnName.INSPECTIONS.value
+    never_decreased = find_unique_inspections(group)
+
+    rows = group.itertuples(index=False)
+    following_rows = islice(group.itertuples(index=False), 1, None)
+
+    for previous_row, current_row in zip(rows, following_rows):
+        previous_counts = Counter(inspection.code for inspection in getattr(previous_row, inspections_column))
+        current_counts = Counter(inspection.code for inspection in getattr(current_row, inspections_column))
+
+        # Counter subtraction keeps only positive counts,
+        # i.e. exactly the codes that occur less often in the current row than in the previous one
+        never_decreased.difference_update(previous_counts - current_counts)
+
+    return never_decreased
+
+
+def get_unique_inspections_stats(
+    df: pd.DataFrame,
+    inspections_to_ignore: List[str],
+    normalize: bool = True,
+) -> pd.DataFrame:
+    """
+    Count for every inspection in how many groups it occurs and in how many groups it gets fixed.
+
+    The result is indexed by inspection and has the columns:
+    `Total` - number of groups where the inspection occurs,
+    `Fixed` - number of groups where the inspection is absent from the last attempt,
+    `Not fixed` - number of groups where the number of occurrences never decreases,
+    `Partially fixed` - the remainder: `Total` - `Fixed` - `Not fixed`.
+
+    `inspections_to_ignore` - inspections to exclude from the result
+    `normalize` - if True, the numbers are converted to percentages of the total number of groups
+    """
+    group_column = SubmissionColumns.GROUP.value
+
+    finders = {
+        'Total': find_unique_inspections,
+        'Fixed': find_fixed_unique_inspections,
+        'Not fixed': find_not_fixed_unique_inspections,
+    }
+
+    counts = []
+    for column_name, finder in finders.items():
+        # One set of inspections per group -> one row per (group, inspection) -> number of groups per inspection
+        number_of_groups_by_inspection = df.groupby(group_column).apply(finder).explode().value_counts()
+        number_of_groups_by_inspection.name = column_name
+        counts.append(number_of_groups_by_inspection)
+
+    # An inspection missing from one of the series was found in no group, hence zero
+    stats = pd.concat(counts, axis=1).fillna(0).convert_dtypes()
+
+    stats['Partially fixed'] = stats['Total'] - stats['Fixed'] - stats['Not fixed']
+
+    is_ignored = stats.index.isin(inspections_to_ignore)
+    stats = stats[~is_ignored]
+    stats.index.name = 'Inspection'
+
+    if not normalize:
+        return stats
+
+    return stats / df[group_column].nunique() * 100
+
+
+def get_inspection_fixing_examples(  # noqa: WPS234
+    group: pd.DataFrame,
+    inspection_name: str,
+) -> List[Tuple[List[HyperstyleIssue], str, List[HyperstyleIssue], str]]:
+    """
+    Find pairs of consecutive rows of `group` where the number of `inspection_name` issues decreases.
+
+    Every example is a tuple: issues in the previous row, code of the previous row,
+    issues in the current row, code of the current row. Only issues with the `inspection_name` code are included.
+    """
+    inspections_column = EduColumnName.INSPECTIONS.value
+    code_column = EduColumnName.CODE_SNIPPETS.value
+
+    rows = group.itertuples(index=False)
+    following_rows = islice(group.itertuples(index=False), 1, None)
+
+    examples = []
+    for previous_row, current_row in zip(rows, following_rows):
+        previous_issues, current_issues = (
+            [issue for issue in getattr(row, inspections_column) if issue.code == inspection_name]
+            for row in (previous_row, current_row)
+        )
+
+        # Only the pairs where some issues disappeared are of interest
+        if len(previous_issues) <= len(current_issues):
+            continue
+
+        examples.append(
+            (
+                previous_issues,
+                getattr(previous_row, code_column),
+                current_issues,
+                getattr(current_row, code_column),
+            ),
+        )
+
+    return examples
diff --git a/jba/src/inspections/gathering.py b/jba/src/inspections/gathering.py
@@ -0,0 +1,163 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pandas as pd
+from hyperstyle.src.python.review.application_config import LanguageVersion
+from hyperstyle.src.python.review.common.language import Language
+
+from core.src.model.column_name import SubmissionColumns
+from core.src.utils.df_utils import read_df, write_df
+from core.src.utils.file.file_utils import get_output_path
+from data_labelling.src.hyperstyle.evaluate import evaluate_hyperstyle
+from data_labelling.src.hyperstyle.hyperstyle_evaluation_config import HyperstyleEvaluationConfig
+from jba.src.models.edu_columns import EduColumnName
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+def _to_absolute_path(value: str) -> Path:
+    """Convert a command line value to an absolute path."""
+    return Path(value).absolute()
+
+
+def configure_parser(parser: argparse.ArgumentParser) -> None:
+    """Register the command line arguments of the inspections gathering script in `parser`."""
+    # Input data
+    parser.add_argument(
+        'submissions_path',
+        help='Path to .csv file with submissions.',
+        type=_to_absolute_path,
+    )
+
+    # Hyperstyle settings
+    parser.add_argument(
+        '--tool-path',
+        required=True,
+        help='Path to the Hyperstyle entry point.',
+        type=_to_absolute_path,
+    )
+
+    parser.add_argument(
+        '--language-version',
+        required=True,
+        help='Language version of code snippets.',
+        choices=LanguageVersion.values(),
+    )
+
+    # Code quality server settings
+    parser.add_argument(
+        '--host',
+        required=True,
+        help='Code quality server address.',
+    )
+
+    parser.add_argument(
+        '--port',
+        required=True,
+        help='Code quality server port.',
+        type=int,
+    )
+
+    # Optional settings
+    parser.add_argument(
+        '--venv',
+        help='Path to venv to run the tool.',
+        type=_to_absolute_path,
+    )
+
+    parser.add_argument(
+        '--disable',
+        help='List of inspectors to disable. Example: `pylint,flake8`.',
+    )
+
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='Run the script in debug mode.',
+    )
+
+    parser.add_argument(
+        '--script-logs-path',
+        help='Path to a file where to save script logs.',
+        type=_to_absolute_path,
+    )
+
+
+def _convert_submissions(submissions: pd.DataFrame, language_version: LanguageVersion) -> pd.DataFrame:
+    """
+    Convert submissions to a dataframe that could be processed by the data_labelling module.
+
+    Every code snippet of every submission becomes a separate row with its own id, file path, code and language.
+    Submissions without code snippets (a missing value, JSON `null` or an empty list) are skipped.
+
+    The index of `submissions` is kept as a regular column
+    (named `index`, assuming the index of `submissions` is unnamed).
+    """
+    snippets_column = EduColumnName.CODE_SNIPPETS.value
+
+    # An explicit copy, so that the assignments below never write into a view of `submissions`
+    df_solutions = (
+        submissions[[EduColumnName.ID.value, snippets_column]]
+        .dropna(subset=[snippets_column])
+        .copy()
+    )
+
+    df_solutions[snippets_column] = df_solutions[snippets_column].apply(json.loads)
+    # TODO: gather inspections from all snippets simultaneously instead of individually
+    df_solutions = df_solutions.explode(snippets_column)
+    # An empty list of snippets is exploded into a single NaN row, which can not be processed further
+    df_solutions = df_solutions.dropna(subset=[snippets_column])
+
+    code_snippets = df_solutions[snippets_column]
+    df_solutions['file_path'] = code_snippets.map(lambda code_snippet: code_snippet['name'])
+    df_solutions[SubmissionColumns.CODE.value] = code_snippets.map(lambda code_snippet: code_snippet['text'])
+
+    # The id must be unique for every snippet, so the file path is appended to the submission id
+    df_solutions[SubmissionColumns.ID.value] = [
+        f'{submission_id}-{file_path.replace("/", "_")}'
+        for submission_id, file_path in zip(df_solutions[EduColumnName.ID.value], df_solutions['file_path'])
+    ]
+
+    df_solutions[SubmissionColumns.LANG.value] = language_version.value
+
+    # After exploding, several rows share the same index label. Moving the index to a column
+    # allows grouping the snippets of one submission back together later.
+    return df_solutions.reset_index()
+
+
+def _dump_issues_by_file(group: pd.DataFrame) -> str:
+    """Serialize the issues of one submission as a JSON dictionary where keys are file paths."""
+    issues_by_file = pd.Series(
+        group[SubmissionColumns.HYPERSTYLE_ISSUES.value].values,
+        index=group['file_path'],
+    )
+
+    return json.dumps(issues_by_file.to_dict())
+
+
+def evaluate_submissions(
+    submissions: pd.DataFrame,
+    language_version: LanguageVersion,
+    config: HyperstyleEvaluationConfig,
+) -> pd.DataFrame:
+    """
+    Gather inspections for `submissions` and return the submissions with an additional inspections column.
+
+    For every evaluated submission the column contains a JSON dictionary
+    where keys are file paths and values are issues found in these files.
+    """
+    df_solutions = _convert_submissions(submissions, language_version)
+
+    # Gathering inspections
+    # NOTE(review): presumably the result keeps the `index` and `file_path` columns - confirm
+    evaluated_solutions = evaluate_hyperstyle(df_solutions, config)
+
+    # Grouping inspections from the same submission into a dictionary
+    inspections = evaluated_solutions.groupby('index').apply(_dump_issues_by_file)
+    inspections = inspections.rename(EduColumnName.INSPECTIONS.value)
+
+    return pd.concat([submissions, inspections], axis=1)
+
+
+# TODO: fix a bug when the server fails on some submissions
+def main():
+    """Gather inspections for the submissions and save the result next to the input file."""
+    parser = argparse.ArgumentParser()
+    configure_parser(parser)
+    args = parser.parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(
+        filename=args.script_logs_path,
+        level=log_level,
+        format='%(asctime)s | %(levelname)s | %(message)s',  # noqa: WPS323 You must use % here to format logger.
+        force=True,
+    )
+
+    submissions = read_df(args.submissions_path)
+
+    # Temporary evaluation files are removed as soon as the evaluation is finished
+    with TemporaryDirectory() as tmpdir:
+        language_version = LanguageVersion(args.language_version)
+        language = Language.from_language_version(language_version).value.lower()
+
+        # Code quality server settings for the language of the submissions
+        server_config = {
+            language: {
+                'host': args.host,
+                'port': args.port,
+            },
+        }
+
+        config = HyperstyleEvaluationConfig(
+            tool_path=args.tool_path,
+            allow_duplicates=False,
+            with_all_categories=True,
+            new_format=False,
+            tmp_path=Path(tmpdir),
+            venv=args.venv,
+            disable=args.disable,
+            ij_config=server_config,
+        )
+
+        submissions_with_inspections = evaluate_submissions(submissions, language_version, config)
+
+    output_path = get_output_path(args.submissions_path, '-with_inspections')
+    write_df(submissions_with_inspections, output_path)
+
+
+# Run the gathering only when the module is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()