Skip to content

Commit

Permalink
Merge pull request #27 from hyperskill/structure-script-refactoring
Browse files Browse the repository at this point in the history
Structure script refactoring
  • Loading branch information
nbirillo authored Oct 24, 2023
2 parents ffa7c70 + 6fb2caf commit a01289f
Show file tree
Hide file tree
Showing 130 changed files with 714 additions and 170 deletions.
11 changes: 11 additions & 0 deletions core/src/utils/file/yaml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@
def parse_yaml(path: Union[Path, str]) -> Any:
with open(path) as file:
return yaml.safe_load(file)


def read_yaml_field_content(yaml_file: Path, field_name: str) -> Any:
    """Return the content of a top-level field from a YAML file.

    :param yaml_file: Path to the YAML file to read.
    :param field_name: Name of the top-level field to extract.
    :return: The field's content, or None if the field is absent.
    :raises ValueError: If the file does not exist or parses to nothing (empty file).
    """
    if not yaml_file.exists():
        # Bug fix: the original message interpolated `field_name` here, so a
        # missing *file* was misreported as a missing field.
        raise ValueError(f'{yaml_file} does not exist.')

    parsed_yaml_file = parse_yaml(yaml_file)
    if parsed_yaml_file is None:
        # Bug fix: removed the unbalanced backtick from the original message.
        raise ValueError(f'{yaml_file} is empty.')

    return parsed_yaml_file.get(field_name)
47 changes: 29 additions & 18 deletions jba/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,40 @@ docker run hyperstyle-analysis-prod:<VERSION> poetry run prepare_course_data [ar

- `data_path` — Path to .csv file with collected data. The file must contain the following columns: `task_id`, `course_id`, `submission_datetime`, `status`, `task_type`, `user_id`, `task_name` (see [an example](tests/resources/processing/all_data.csv) in the tests).
- `course_id` — Course id to analyze.
- `course_sources_path` — Path to course sources to extract course structure (see [an example](tests/resources/processing/course_example) in the tests).
- `course_sources_path` — Path to course sources to extract course structure.

After this step you will get a new file with `courseId` suffix. This file will contain all data from the `data_path` file, but only for the course with the course id `course_id`.
Also, an additional file with the course structure will be generated, e.g. for the [course](tests/resources/processing/course_example) from the test folder with the following structure:
Also, an additional file with the course structure will be generated, e.g. for the [course](tests/resources/processing/prepare_course_data/course_with_section) from the test folder with the following structure:
```text
- course_root
-- course-info.yaml
-- course-remote-info.yaml
-- section
--- section-info.yaml
--- section-remote-info.yaml
--- lesson
---- lesson-info.yaml
---- lesson-remote-info.yaml
---- task1
----- task-info.yaml
----- task-remote-info.yaml
---- task2
----- task-info.yaml
----- task-remote-info.yaml
course-info.yaml
course-remote-info.yaml
section/
├── section-info.yaml
├── section-remote-info.yaml
└── lesson/
├── lesson-info.yaml
├── lesson-remote-info.yaml
├── task1/
│ ├── task-info.yaml
│ ├── task-remote-info.yaml
│ ├── src/
│ │ └── ...
│ └── task.md
├── task2/
│ ├── task-info.yaml
│ ├── task-remote-info.yaml
│ ├── task.md
│ └── src/
│ └── ...
└── task3/
├── task-info.yaml
├── task-remote-info.yaml
├── task.md
└── src/
└── ...
```

the [following](tests/resources/processing/course_1_structure_expected.csv) file will be generated.
the [following](tests/resources/processing/prepare_course_data/expected_course_with_section.csv) file will be generated.

2. [data_processing.py](src/processing/data_processing.py) allows you to process data from the previous step:
- Merge course data with task info
Expand Down
37 changes: 22 additions & 15 deletions jba/src/models/edu_columns.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from enum import Enum, unique

from jba.src.models.edu_structure import EduStructureType

ID_COLUMN_POSTFIX = 'id'
NAME_COLUMN_POSTFIX = 'name'
NUMBER_COLUMN_POSTFIX = 'number'
AMOUNT_COLUMN_POSTFIX = 'amount'


@unique
class EduColumnName(Enum):
Expand All @@ -12,21 +19,21 @@ class EduColumnName(Enum):
CODE_SNIPPETS = 'code_snippets'
UUID = 'uuid'

TASK_ID = 'task_id'
TASK_GLOBAL_NUMBER = 'task_global_number'
TASK_NAME = 'task_name'
TASK_NUMBER = 'task_number'
TASKS_AMOUNT = 'tasks_amount'

LESSON_ID = 'lesson_id'
LESSON_NAME = 'lesson_name'
LESSON_NUMBER = 'lesson_number'
LESSONS_AMOUNT = 'lessons_amount'

SECTION_ID = 'section_id'
SECTION_NAME = 'section_name'
SECTION_NUMBER = 'section_number'
SECTIONS_AMOUNT = 'sections_amount'
TASK_GLOBAL_NUMBER = f'{EduStructureType.TASK.value}_global_number'
TASK_ID = f'{EduStructureType.TASK.value}_{ID_COLUMN_POSTFIX}'
TASK_NAME = f'{EduStructureType.TASK.value}_{NAME_COLUMN_POSTFIX}'
TASK_NUMBER = f'{EduStructureType.TASK.value}_{NUMBER_COLUMN_POSTFIX}'
TASK_AMOUNT = f'{EduStructureType.TASK.value}_{AMOUNT_COLUMN_POSTFIX}'

LESSON_ID = f'{EduStructureType.LESSON.value}_{ID_COLUMN_POSTFIX}'
LESSON_NAME = f'{EduStructureType.LESSON.value}_{NAME_COLUMN_POSTFIX}'
LESSON_NUMBER = f'{EduStructureType.LESSON.value}_{NUMBER_COLUMN_POSTFIX}'
LESSON_AMOUNT = f'{EduStructureType.LESSON.value}_{AMOUNT_COLUMN_POSTFIX}'

SECTION_ID = f'{EduStructureType.SECTION.value}_{ID_COLUMN_POSTFIX}'
SECTION_NAME = f'{EduStructureType.SECTION.value}_{NAME_COLUMN_POSTFIX}'
SECTION_NUMBER = f'{EduStructureType.SECTION.value}_{NUMBER_COLUMN_POSTFIX}'
SECTION_AMOUNT = f'{EduStructureType.SECTION.value}_{AMOUNT_COLUMN_POSTFIX}'

SOLUTION_AWS_KEY = 'solution_aws_key'
FORMAT_VERSION = 'format_version'
Expand Down
10 changes: 0 additions & 10 deletions jba/src/models/edu_config_item.py

This file was deleted.

18 changes: 18 additions & 0 deletions jba/src/models/edu_structure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional


class EduStructureType(Enum):
    """Kinds of structural units in a JetBrains Academy (Edu) course hierarchy.

    The values match the prefixes of the `<type>-info.yaml` /
    `<type>-remote-info.yaml` files found in a course source tree.
    """

    COURSE = 'course'
    SECTION = 'section'
    LESSON = 'lesson'
    TASK = 'task'


@dataclass(frozen=True)
class EduStructureNode:
    """Immutable node of an Edu course structure tree."""

    # Identifier of the unit (read from the `id` field of its remote-info YAML).
    id: int
    # Name of the unit (the directory name in the course sources).
    name: str
    # Kind of unit: course, section, lesson, or task.
    structure_type: EduStructureType
    # Child nodes; None for leaf nodes (tasks have no `content` field).
    children: Optional[List['EduStructureNode']]
195 changes: 108 additions & 87 deletions jba/src/processing/prepare_course_data.py
Original file line number Diff line number Diff line change
@@ -1,119 +1,140 @@
import re
from os import listdir

import argparse
from os.path import exists
from pathlib import Path
from typing import Any

import pandas as pd
import sys

from core.src.utils.df_utils import read_df, filter_df_by_single_value, write_df
from core.src.utils.file.extension_utils import AnalysisExtension
from core.src.utils.file.file_utils import get_parent_folder, remove_slash
from core.src.utils.file.yaml_utils import parse_yaml
from jba.src.models.edu_columns import EduColumnName
from jba.src.models.edu_config_item import EduConfigItem
from core.src.utils.file.file_utils import get_output_path
from core.src.utils.file.yaml_utils import read_yaml_field_content
from jba.src.models.edu_columns import (
EduColumnName,
NUMBER_COLUMN_POSTFIX,
AMOUNT_COLUMN_POSTFIX,
ID_COLUMN_POSTFIX,
NAME_COLUMN_POSTFIX,
)
from jba.src.models.edu_structure import EduStructureNode, EduStructureType

CONTENT_META_FIELD = 'content'
ID_META_FIELD = 'id'

CONTENT_FIELD = 'content'
ID_FIELD = 'id'
INFO_FILE_REGEX = re.compile(f'([a-z]+)-info{AnalysisExtension.YAML.value}')
REMOTE_INFO_FILE_REGEX = re.compile(f'([a-z]+)-remote-info{AnalysisExtension.YAML.value}')


def parse_course_config(base_path: str, inner_folder: str) -> dict:
full_path = f'{base_path}/{inner_folder}'
if not exists(full_path):
raise ValueError(f'The {inner_folder} does not exist in {base_path}!')
return parse_yaml(full_path)
def filter_by_course_id_and_save(df_path: Path, course_id: int):
    """Keep only the submissions of `course_id` and save them next to the input file."""
    submissions = read_df(df_path)
    course_submissions = filter_df_by_single_value(submissions, EduColumnName.COURSE_ID.value, course_id)

    output_file = df_path.parent / f'course_{course_id}{AnalysisExtension.CSV.value}'
    write_df(course_submissions, output_file)


def _parse_yaml_section(yaml_config_parsed: dict, yaml_section: str, file_path: str) -> Any:
if yaml_section not in yaml_config_parsed:
raise ValueError(f'You need to specify {yaml_section} section in the {file_path} file!')
return yaml_config_parsed[yaml_section]
def _gather_structure(root: Path) -> EduStructureNode:  # noqa: WPS238
    """Recursively build the Edu structure tree rooted at `root`.

    A structure directory must contain exactly one `<type>-info.yaml` and one
    `<type>-remote-info.yaml` with matching `<type>` prefixes. The node id is
    read from the remote-info file; children are listed in the info file's
    `content` field (absent for leaf/task nodes).

    :param root: Directory of a course/section/lesson/task.
    :return: The structure node for `root` with its children (None for leaves).
    :raises ValueError: On missing/ambiguous info files, mismatched structure
        types, a missing id field, or children of mixed structure types.
    """
    file_names = listdir(root)

    info_files = list(filter(lambda file_name: re.match(INFO_FILE_REGEX, file_name), file_names))
    if len(info_files) != 1:
        raise ValueError(f'The number of info files in {root} must be exactly 1 (actual: {len(info_files)}).')

    info_file = info_files[0]
    info_file_structure_type = re.match(INFO_FILE_REGEX, info_file).group(1)

    remote_info_files = list(filter(lambda file_name: re.match(REMOTE_INFO_FILE_REGEX, file_name), file_names))
    if len(remote_info_files) != 1:
        raise ValueError(
            f'The number of remote info files in {root} must be exactly 1 (actual: {len(remote_info_files)}).',
        )

    remote_info_file = remote_info_files[0]
    remote_info_file_structure_type = re.match(REMOTE_INFO_FILE_REGEX, remote_info_file).group(1)

    # Both file name prefixes must agree on what kind of unit this directory is.
    if info_file_structure_type != remote_info_file_structure_type:
        raise ValueError(f'Unable to determine a structure type for {root}.')

    structure_type = EduStructureType(info_file_structure_type)

    structure_id = read_yaml_field_content(root / remote_info_file, ID_META_FIELD)
    if structure_id is None:
        raise ValueError(f'{root / remote_info_file} must contain the {ID_META_FIELD} field.')

    children = None
    content = read_yaml_field_content(root / info_file, CONTENT_META_FIELD)
    if content is not None:
        children = [_gather_structure(root / name) for name in content]

        # Generator instead of a list: short-circuits on the first mismatch.
        if not all(node.structure_type == children[0].structure_type for node in children):
            raise ValueError(f'All children nodes inside {root} must have the same structure type.')

    return EduStructureNode(structure_id, root.name, structure_type, children)


def _convert_structure_to_dataframe(structure: EduStructureNode) -> pd.DataFrame:
    """Flatten an Edu structure tree into a dataframe with one row per task.

    Each recursion level appends `<type>_number` / `<type>_amount` columns for
    the children and `<type>_id` / `<type>_name` columns for the current node.

    :param structure: Root node of the (sub)tree to convert.
    :return: Dataframe whose rows correspond to the leaf (task) nodes.
    """
    if structure.children is None:
        # A node without children is a task (leaf) node.
        # (Fixed idiom: dropped the pointless f-string wrappers around plain expressions.)
        return pd.DataFrame.from_dict(
            {EduColumnName.TASK_ID.value: [structure.id], EduColumnName.TASK_NAME.value: [structure.name]}
        )

    children_dfs = []
    for i, node in enumerate(structure.children, start=1):
        node_df = _convert_structure_to_dataframe(node)
        node_df[f'{node.structure_type.value}_{NUMBER_COLUMN_POSTFIX}'] = i
        node_df[f'{node.structure_type.value}_{AMOUNT_COLUMN_POSTFIX}'] = len(structure.children)
        children_dfs.append(node_df)

    structure_df = pd.concat(children_dfs, ignore_index=True)
    structure_df[f'{structure.structure_type.value}_{ID_COLUMN_POSTFIX}'] = structure.id
    structure_df[f'{structure.structure_type.value}_{NAME_COLUMN_POSTFIX}'] = structure.name

    return structure_df


def get_course_structure(course_root: Path) -> pd.DataFrame:
    """Build the course-structure dataframe for the course rooted at `course_root`."""
    structure_df = _convert_structure_to_dataframe(_gather_structure(course_root))

    # The course-level columns carry no useful information, so drop them.
    redundant_columns = [
        f'{EduStructureType.COURSE.value}_{NAME_COLUMN_POSTFIX}',
        EduColumnName.COURSE_ID.value,
    ]
    structure_df.drop(columns=redundant_columns, inplace=True)

    # Turn the (shifted, 1-based) row index into the "task global number" column.
    structure_df.index += 1
    structure_df.reset_index(inplace=True, names=[EduColumnName.TASK_GLOBAL_NUMBER.value])

    return structure_df


def configure_parser(parser: argparse.ArgumentParser) -> None:
    """Register the CLI arguments for the course-data preparation script.

    Bug fix: the first positional argument had been renamed `submissions_path`
    while `main` still reads `args.data_path` (and the README documents
    `data_path`), which would raise AttributeError at runtime. The name is
    restored to `data_path`; the Path-converting `type=` is kept.
    """
    parser.add_argument(
        'data_path',
        type=lambda value: Path(value).absolute(),
        help='Path to .csv file with collected data.',
    )

    parser.add_argument('course_id', type=int, help='Course id to analyze.')

    parser.add_argument(
        'course_sources_path',
        type=lambda value: Path(value).absolute(),
        help='Path to course sources to extract course structure.',
    )


def main():
    """CLI entry point: filter submissions by course id and extract the course structure.

    Writes two CSVs: the filtered submissions (next to the input file) and the
    course structure (input path with a `-with-structure` suffix).
    """
    parser = argparse.ArgumentParser()
    configure_parser(parser)

    args = parser.parse_args()

    filter_by_course_id_and_save(args.data_path, args.course_id)

    course_structure = get_course_structure(args.course_sources_path)
    write_df(course_structure, get_output_path(args.data_path, '-with-structure'))


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit a01289f

Please sign in to comment.