Skip to content

Commit

Permalink
Merge pull request #27 from hyperskill/structure-script-refactoring
Browse files Browse the repository at this point in the history
Structure script refactoring
  • Loading branch information
nbirillo authored Oct 24, 2023
2 parents ffa7c70 + 6fb2caf commit a01289f
Show file tree
Hide file tree
Showing 130 changed files with 714 additions and 170 deletions.
11 changes: 11 additions & 0 deletions core/src/utils/file/yaml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@
def parse_yaml(path: Union[Path, str]) -> Any:
with open(path) as file:
return yaml.safe_load(file)


def read_yaml_field_content(yaml_file: Path, field_name: str) -> Any:
    """Return the content of a top-level field from a YAML file.

    :param yaml_file: Path to the YAML file to read.
    :param field_name: Name of the top-level field to extract.
    :return: The field's content, or None if the field is absent.
    :raises ValueError: If the file does not exist or parses to nothing (empty file).
    """
    if not yaml_file.exists():
        # Bug fix: the original message interpolated `field_name` here, so a
        # missing *file* was misreported as a missing field.
        raise ValueError(f'{yaml_file} does not exist.')

    parsed_yaml_file = parse_yaml(yaml_file)
    if parsed_yaml_file is None:
        # Bug fix: removed the unbalanced backtick from the original message.
        raise ValueError(f'{yaml_file} is empty.')

    return parsed_yaml_file.get(field_name)
47 changes: 29 additions & 18 deletions jba/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,40 @@ docker run hyperstyle-analysis-prod:<VERSION> poetry run prepare_course_data [ar

- `data_path` — Path to .csv file with collected data. The file must contain the following columns: `task_id`, `course_id`, `submission_datetime`, `status`, `task_type`, `user_id`, `task_name` (see [an example](tests/resources/processing/all_data.csv) in the tests).
- `course_id` — Course id to analyze.
- `course_sources_path` — Path to course sources to extract course structure (see [an example](tests/resources/processing/course_example) in the tests).
- `course_sources_path` — Path to course sources to extract course structure.

After this step you will get a new file with `courseId` suffix. This file will contain all data from the `data_path` file, but only for the course with the course id `course_id`.
Also, an additional file with the course structure will be generated, e.g. for the [course](tests/resources/processing/course_example) from the test folder with the following structure:
Also, an additional file with the course structure will be generated, e.g. for the [course](tests/resources/processing/prepare_course_data/course_with_section) from the test folder with the following structure:
```text
- course_root
-- course-info.yaml
-- course-remote-info.yaml
-- section
--- section-info.yaml
--- section-remote-info.yaml
--- lesson
---- lesson-info.yaml
---- lesson-remote-info.yaml
---- task1
----- task-info.yaml
----- task-remote-info.yaml
---- task2
----- task-info.yaml
----- task-remote-info.yaml
course-info.yaml
course-remote-info.yaml
section/
├── section-info.yaml
├── section-remote-info.yaml
└── lesson/
├── lesson-info.yaml
├── lesson-remote-info.yaml
├── task1/
│ ├── task-info.yaml
│ ├── task-remote-info.yaml
│ ├── src/
│ │ └── ...
│ └── task.md
├── task2/
│ ├── task-info.yaml
│ ├── task-remote-info.yaml
│ ├── task.md
│ └── src/
│ └── ...
└── task3/
├── task-info.yaml
├── task-remote-info.yaml
├── task.md
└── src/
└── ...
```

the [following](tests/resources/processing/course_1_structure_expected.csv) file will be generated.
the [following](tests/resources/processing/prepare_course_data/expected_course_with_section.csv) file will be generated.

2. [data_processing.py](src/processing/data_processing.py) allows you to process data from the previous step:
- Merge course data with task info
Expand Down
37 changes: 22 additions & 15 deletions jba/src/models/edu_columns.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from enum import Enum, unique

from jba.src.models.edu_structure import EduStructureType

ID_COLUMN_POSTFIX = 'id'
NAME_COLUMN_POSTFIX = 'name'
NUMBER_COLUMN_POSTFIX = 'number'
AMOUNT_COLUMN_POSTFIX = 'amount'


@unique
class EduColumnName(Enum):
Expand All @@ -12,21 +19,21 @@ class EduColumnName(Enum):
CODE_SNIPPETS = 'code_snippets'
UUID = 'uuid'

TASK_ID = 'task_id'
TASK_GLOBAL_NUMBER = 'task_global_number'
TASK_NAME = 'task_name'
TASK_NUMBER = 'task_number'
TASKS_AMOUNT = 'tasks_amount'

LESSON_ID = 'lesson_id'
LESSON_NAME = 'lesson_name'
LESSON_NUMBER = 'lesson_number'
LESSONS_AMOUNT = 'lessons_amount'

SECTION_ID = 'section_id'
SECTION_NAME = 'section_name'
SECTION_NUMBER = 'section_number'
SECTIONS_AMOUNT = 'sections_amount'
TASK_GLOBAL_NUMBER = f'{EduStructureType.TASK.value}_global_number'
TASK_ID = f'{EduStructureType.TASK.value}_{ID_COLUMN_POSTFIX}'
TASK_NAME = f'{EduStructureType.TASK.value}_{NAME_COLUMN_POSTFIX}'
TASK_NUMBER = f'{EduStructureType.TASK.value}_{NUMBER_COLUMN_POSTFIX}'
TASK_AMOUNT = f'{EduStructureType.TASK.value}_{AMOUNT_COLUMN_POSTFIX}'

LESSON_ID = f'{EduStructureType.LESSON.value}_{ID_COLUMN_POSTFIX}'
LESSON_NAME = f'{EduStructureType.LESSON.value}_{NAME_COLUMN_POSTFIX}'
LESSON_NUMBER = f'{EduStructureType.LESSON.value}_{NUMBER_COLUMN_POSTFIX}'
LESSON_AMOUNT = f'{EduStructureType.LESSON.value}_{AMOUNT_COLUMN_POSTFIX}'

SECTION_ID = f'{EduStructureType.SECTION.value}_{ID_COLUMN_POSTFIX}'
SECTION_NAME = f'{EduStructureType.SECTION.value}_{NAME_COLUMN_POSTFIX}'
SECTION_NUMBER = f'{EduStructureType.SECTION.value}_{NUMBER_COLUMN_POSTFIX}'
SECTION_AMOUNT = f'{EduStructureType.SECTION.value}_{AMOUNT_COLUMN_POSTFIX}'

SOLUTION_AWS_KEY = 'solution_aws_key'
FORMAT_VERSION = 'format_version'
Expand Down
10 changes: 0 additions & 10 deletions jba/src/models/edu_config_item.py

This file was deleted.

18 changes: 18 additions & 0 deletions jba/src/models/edu_structure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional


class EduStructureType(Enum):
    """Kinds of structural units in a JetBrains Academy (Edu) course hierarchy.

    The values match the prefixes of the `<type>-info.yaml` /
    `<type>-remote-info.yaml` files found in a course source tree.
    """

    COURSE = 'course'
    SECTION = 'section'
    LESSON = 'lesson'
    TASK = 'task'


@dataclass(frozen=True)
class EduStructureNode:
    """Immutable node of an Edu course structure tree."""

    # Identifier of the unit (read from the `id` field of its remote-info YAML).
    id: int
    # Name of the unit (the directory name in the course sources).
    name: str
    # Kind of unit: course, section, lesson, or task.
    structure_type: EduStructureType
    # Child nodes; None for leaf nodes (tasks have no `content` field).
    children: Optional[List['EduStructureNode']]
195 changes: 108 additions & 87 deletions jba/src/processing/prepare_course_data.py
Original file line number Diff line number Diff line change
@@ -1,119 +1,140 @@
import re
from os import listdir

import argparse
from os.path import exists
from pathlib import Path
from typing import Any

import pandas as pd
import sys

from core.src.utils.df_utils import read_df, filter_df_by_single_value, write_df
from core.src.utils.file.extension_utils import AnalysisExtension
from core.src.utils.file.file_utils import get_parent_folder, remove_slash
from core.src.utils.file.yaml_utils import parse_yaml
from jba.src.models.edu_columns import EduColumnName
from jba.src.models.edu_config_item import EduConfigItem
from core.src.utils.file.file_utils import get_output_path
from core.src.utils.file.yaml_utils import read_yaml_field_content
from jba.src.models.edu_columns import (
EduColumnName,
NUMBER_COLUMN_POSTFIX,
AMOUNT_COLUMN_POSTFIX,
ID_COLUMN_POSTFIX,
NAME_COLUMN_POSTFIX,
)
from jba.src.models.edu_structure import EduStructureNode, EduStructureType

CONTENT_META_FIELD = 'content'
ID_META_FIELD = 'id'

CONTENT_FIELD = 'content'
ID_FIELD = 'id'
INFO_FILE_REGEX = re.compile(f'([a-z]+)-info{AnalysisExtension.YAML.value}')
REMOTE_INFO_FILE_REGEX = re.compile(f'([a-z]+)-remote-info{AnalysisExtension.YAML.value}')


def parse_course_config(base_path: str, inner_folder: str) -> dict:
full_path = f'{base_path}/{inner_folder}'
if not exists(full_path):
raise ValueError(f'The {inner_folder} does not exist in {base_path}!')
return parse_yaml(full_path)
def filter_by_course_id_and_save(df_path: Path, course_id: int):
    """Keep only the submissions of `course_id` and save them next to the input file."""
    submissions = read_df(df_path)
    course_submissions = filter_df_by_single_value(submissions, EduColumnName.COURSE_ID.value, course_id)

    output_file = df_path.parent / f'course_{course_id}{AnalysisExtension.CSV.value}'
    write_df(course_submissions, output_file)


def _parse_yaml_section(yaml_config_parsed: dict, yaml_section: str, file_path: str) -> Any:
if yaml_section not in yaml_config_parsed:
raise ValueError(f'You need to specify {yaml_section} section in the {file_path} file!')
return yaml_config_parsed[yaml_section]
def _gather_structure(root: Path) -> EduStructureNode:  # noqa: WPS238
    """Recursively build the Edu structure tree rooted at `root`.

    A structure directory must contain exactly one `<type>-info.yaml` and one
    `<type>-remote-info.yaml` with matching `<type>` prefixes. The node id is
    read from the remote-info file; children are listed in the info file's
    `content` field (absent for leaf/task nodes).

    :param root: Directory of a course/section/lesson/task.
    :return: The structure node for `root` with its children (None for leaves).
    :raises ValueError: On missing/ambiguous info files, mismatched structure
        types, a missing id field, or children of mixed structure types.
    """
    file_names = listdir(root)

    info_files = list(filter(lambda file_name: re.match(INFO_FILE_REGEX, file_name), file_names))
    if len(info_files) != 1:
        raise ValueError(f'The number of info files in {root} must be exactly 1 (actual: {len(info_files)}).')

    info_file = info_files[0]
    info_file_structure_type = re.match(INFO_FILE_REGEX, info_file).group(1)

    remote_info_files = list(filter(lambda file_name: re.match(REMOTE_INFO_FILE_REGEX, file_name), file_names))
    if len(remote_info_files) != 1:
        raise ValueError(
            f'The number of remote info files in {root} must be exactly 1 (actual: {len(remote_info_files)}).',
        )

    remote_info_file = remote_info_files[0]
    remote_info_file_structure_type = re.match(REMOTE_INFO_FILE_REGEX, remote_info_file).group(1)

    # Both file name prefixes must agree on what kind of unit this directory is.
    if info_file_structure_type != remote_info_file_structure_type:
        raise ValueError(f'Unable to determine a structure type for {root}.')

    structure_type = EduStructureType(info_file_structure_type)

    structure_id = read_yaml_field_content(root / remote_info_file, ID_META_FIELD)
    if structure_id is None:
        raise ValueError(f'{root / remote_info_file} must contain the {ID_META_FIELD} field.')

    children = None
    content = read_yaml_field_content(root / info_file, CONTENT_META_FIELD)
    if content is not None:
        children = [_gather_structure(root / name) for name in content]

        # Generator instead of a list: short-circuits on the first mismatch.
        if not all(node.structure_type == children[0].structure_type for node in children):
            raise ValueError(f'All children nodes inside {root} must have the same structure type.')

    return EduStructureNode(structure_id, root.name, structure_type, children)


def _convert_structure_to_dataframe(structure: EduStructureNode) -> pd.DataFrame:
    """Flatten an Edu structure tree into a dataframe with one row per task.

    Each recursion level appends `<type>_number` / `<type>_amount` columns for
    the children and `<type>_id` / `<type>_name` columns for the current node.

    :param structure: Root node of the (sub)tree to convert.
    :return: Dataframe whose rows correspond to the leaf (task) nodes.
    """
    if structure.children is None:
        # A node without children is a task (leaf) node.
        # (Fixed idiom: dropped the pointless f-string wrappers around plain expressions.)
        return pd.DataFrame.from_dict(
            {EduColumnName.TASK_ID.value: [structure.id], EduColumnName.TASK_NAME.value: [structure.name]}
        )

    children_dfs = []
    for i, node in enumerate(structure.children, start=1):
        node_df = _convert_structure_to_dataframe(node)
        node_df[f'{node.structure_type.value}_{NUMBER_COLUMN_POSTFIX}'] = i
        node_df[f'{node.structure_type.value}_{AMOUNT_COLUMN_POSTFIX}'] = len(structure.children)
        children_dfs.append(node_df)

    structure_df = pd.concat(children_dfs, ignore_index=True)
    structure_df[f'{structure.structure_type.value}_{ID_COLUMN_POSTFIX}'] = structure.id
    structure_df[f'{structure.structure_type.value}_{NAME_COLUMN_POSTFIX}'] = structure.name

    return structure_df


def get_course_structure(course_root: Path) -> pd.DataFrame:
    """Build the course-structure dataframe for the course rooted at `course_root`."""
    structure_df = _convert_structure_to_dataframe(_gather_structure(course_root))

    # The course-level columns carry no useful information, so drop them.
    redundant_columns = [
        f'{EduStructureType.COURSE.value}_{NAME_COLUMN_POSTFIX}',
        EduColumnName.COURSE_ID.value,
    ]
    structure_df.drop(columns=redundant_columns, inplace=True)

    # Turn the (shifted, 1-based) row index into the "task global number" column.
    structure_df.index += 1
    structure_df.reset_index(inplace=True, names=[EduColumnName.TASK_GLOBAL_NUMBER.value])

    return structure_df


def configure_parser(parser: argparse.ArgumentParser) -> None:
    """Register the CLI arguments for the course-data preparation script.

    Bug fix: the first positional argument had been renamed `submissions_path`
    while `main` still reads `args.data_path` (and the README documents
    `data_path`), which would raise AttributeError at runtime. The name is
    restored to `data_path`; the Path-converting `type=` is kept.
    """
    parser.add_argument(
        'data_path',
        type=lambda value: Path(value).absolute(),
        help='Path to .csv file with collected data.',
    )

    parser.add_argument('course_id', type=int, help='Course id to analyze.')

    parser.add_argument(
        'course_sources_path',
        type=lambda value: Path(value).absolute(),
        help='Path to course sources to extract course structure.',
    )


def main():
    """CLI entry point: filter submissions by course id and extract the course structure.

    Writes two CSVs: the filtered submissions (next to the input file) and the
    course structure (input path with a `-with-structure` suffix).
    """
    parser = argparse.ArgumentParser()
    configure_parser(parser)

    args = parser.parse_args()

    filter_by_course_id_and_save(args.data_path, args.course_id)

    course_structure = get_course_structure(args.course_sources_path)
    write_df(course_structure, get_output_path(args.data_path, '-with-structure'))


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit a01289f

Please sign in to comment.