Skip to content

Commit

Permalink
Merge pull request #1612 from dodona-edu/csv-template-parse
Browse files Browse the repository at this point in the history
Add support for template files into CSV parsing
  • Loading branch information
rien authored Oct 16, 2024
2 parents 44f9b33 + 1277c1d commit ce7ab2f
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 99 deletions.
1 change: 1 addition & 0 deletions core/src/file/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export interface ExtraInfo {
exerciseID: string;
createdAt: Date;
labels: string;
ignored: string;
}

/**
Expand Down
141 changes: 141 additions & 0 deletions lib/src/lib/dataset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import { readFiles, readPath } from "./reader.js";

import { ExtraInfo, File, Result } from "@dodona/dolos-core";
import { csvParse, DSVRowString } from "d3-dsv";

import { constants } from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { spawnSync as spawn } from "node:child_process";
import { tmpdir } from "node:os";

/**
 * A named collection of files to analyze, optionally accompanied by a single
 * template/boilerplate file (`ignore`).
 *
 * Instances are normally obtained through {@link Dataset.create}, which accepts
 * a ZIP archive, a CSV info file, or a plain list of file paths.
 */
export class Dataset {
  constructor(
    public name: string,
    public files: File[],
    public ignore?: File) {
  }

  /**
   * Collects every regular file found under `dirPath`, descending into
   * subdirectories.
   *
   * @param dirPath directory to start from
   * @returns the combined result of calling `readPath` on each file found
   */
  private static async fromDirectory(dirPath: string): Promise<Result<File[]>> {
    // `pending` doubles as a work queue: subdirectories are appended while it
    // is being walked, so the index loop visits them as well.
    const pending = [dirPath];
    const files = [];

    for (let i = 0; i < pending.length; i += 1) {
      const current = pending[i];
      for (const entry of await fs.readdir(current, { withFileTypes: true })) {
        const entryPath = path.join(current, entry.name);
        if (entry.isDirectory()) {
          pending.push(entryPath);
        } else if (entry.isFile()) {
          files.push(readPath(entryPath));
        }
      }
    }

    return await Result.all(files);
  }

  /**
   * Determines which file, if any, serves as the template/boilerplate file.
   *
   * An explicitly given `ignore` path takes precedence over a file whose
   * `extra.ignored` field equals `"true"`.
   *
   * @param resolvedFiles files that were already read
   * @param ignore optional path of a template file to read
   * @returns the template file, or `undefined` when there is none
   * @throws Error when more than one file has `extra.ignored === "true"`
   */
  private static async setIgnoredFile(resolvedFiles: File[], ignore?: string): Promise<File | undefined> {
    const ignoredFiles = resolvedFiles.filter(file => file.extra?.ignored === "true");
    if (ignoredFiles.length > 1) {
      throw new Error(
        "More than one file has the ignored field set to true. " +
        "Only one template/boilerplate code file is allowed at this moment."
      );
    }
    if (ignore) {
      return (await readPath(ignore)).ok();
    }
    return ignoredFiles.length === 1 ? ignoredFiles[0] : undefined;
  }

  /**
   * Builds a dataset from a ZIP archive.
   *
   * The archive is extracted into a fresh temporary directory using the
   * external `unzip` command. When the extracted root contains a readable
   * `info.csv`, that file drives the dataset; otherwise every extracted file
   * is included. The temporary directory is removed afterwards, whether or
   * not processing succeeded.
   *
   * @param zipPath path of the archive
   * @param ignore optional path of a template file
   * @throws Error when `unzip` cannot be spawned or exits with a non-zero status
   */
  private static async fromZIP(
    zipPath: string,
    ignore?: string
  ): Promise<Dataset> {
    const tmpDir = await fs.mkdtemp(path.join(tmpdir(), "dolos-unzip-"));
    try {
      const { status, error, stderr } = spawn("unzip", [zipPath, "-d", tmpDir]);
      if (error) {
        throw error;
      } else if (status !== 0) {
        throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
      }

      const name = path.basename(zipPath, ".zip");
      const infoPath = path.join(tmpDir, "info.csv");
      const hasInfo = await fs.access(infoPath, constants.R_OK).then(() => true).catch(() => false);
      if (hasInfo) {
        const dataset = await Dataset.fromCSV(infoPath, ignore);
        // The CSV lives in a temporary directory, so name the dataset after the archive.
        dataset.name = name;
        return dataset;
      }

      const files = (await Dataset.fromDirectory(tmpDir)).ok();
      return new Dataset(name, files, undefined);
    } finally {
      await fs.rm(tmpDir, { recursive: true });
    }
  }

  /**
   * Builds a dataset from a CSV info file.
   *
   * Each row names a file (relative to the directory of the CSV) and carries
   * extra metadata. Rows whose `ignored` column equals `"true"` are left out
   * of `files`; at most one such row is allowed, and it becomes the template
   * file unless an explicit `ignore` path is given.
   *
   * @param infoPath path of the CSV file
   * @param ignore optional path of a template file
   * @throws Error with a generic message when the CSV or a file it lists
   *   cannot be read
   * @throws Error when more than one row is marked as ignored
   */
  private static async fromCSV(
    infoPath: string,
    ignore?: string
  ): Promise<Dataset> {
    const dirname = path.dirname(infoPath);

    let resolvedFiles: File[];
    try {
      const csvFiles = csvParse((await fs.readFile(infoPath)).toString())
        .map((row: DSVRowString) => ({
          filename: row.filename as string,
          fullName: row.full_name as string,
          id: row.id as string,
          status: row.status as string,
          submissionID: row.submission_id as string,
          nameEN: row.name_en as string,
          nameNL: row.name_nl as string,
          exerciseID: row.exercise_id as string,
          createdAt: new Date(row.created_at as string),
          labels: row.label as string || row.labels as string,
          ignored: row.ignored as string
        }))
        .map((row: ExtraInfo) => readPath(path.join(dirname, row.filename), row));
      resolvedFiles = (await Result.all(csvFiles)).ok();
    } catch {
      throw new Error("The given '.csv'-file could not be opened");
    }

    // Resolved outside the try block so that its own, more specific errors
    // are not replaced by the generic message above.
    const ignoredFile = await Dataset.setIgnoredFile(resolvedFiles, ignore);
    const files = resolvedFiles.filter(file => file.extra?.ignored !== "true");
    const nameCandidate = dirname.split(path.sep).pop() || "undefined";
    return new Dataset(nameCandidate, files, ignoredFile);
  }

  /**
   * Creates a dataset from the given input paths.
   *
   * A single path must be a `.zip` archive or a `.csv` info file (the
   * extension is matched case-insensitively). Any other number of paths is
   * treated as a plain list of files to read.
   *
   * @param paths input paths
   * @param ignore optional path of a template file
   * @throws Error when exactly one path is given and it is neither a CSV
   *   file nor a ZIP archive
   */
  public static async create(paths: string[], ignore?: string): Promise<Dataset> {
    if (paths.length === 1) {
      const inputFile = paths[0];
      const lowered = inputFile.toLowerCase();
      if (lowered.endsWith(".zip")) {
        return Dataset.fromZIP(inputFile, ignore);
      } else if (lowered.endsWith(".csv")) {
        return Dataset.fromCSV(inputFile, ignore);
      } else {
        throw new Error("You gave one input file, but it is not a CSV file or a ZIP archive.");
      }
    }

    const resolvedFiles = (await readFiles(paths)).ok();
    const resolvedIgnoredFile = await Dataset.setIgnoredFile(resolvedFiles, ignore);
    // Only derive a name when there really are two paths to name it after;
    // `path.basename(undefined)` would throw for an empty list.
    const nameCandidate = paths.length >= 2
      ? path.basename(paths[0]) + " & " + path.basename(paths[1])
      : "undefined";
    return new Dataset(nameCandidate, resolvedFiles, resolvedIgnoredFile);
  }
}
103 changes: 4 additions & 99 deletions lib/src/lib/dolos.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,9 @@ import { Report } from "./report.js";
import { CustomOptions, Options } from "./options.js";
import { Tokenizer } from "./tokenizer/tokenizer.js";
import { Language, LanguagePicker } from "./language.js";
import { readFiles, readPath } from "./reader.js";
import { Dataset } from "./dataset.js";

import { FingerprintIndex, ExtraInfo, File, Result } from "@dodona/dolos-core";
import { csvParse, DSVRowString } from "d3-dsv";

import { constants } from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { spawnSync as spawn } from "node:child_process";
import { tmpdir } from "node:os";
import { FingerprintIndex, File } from "@dodona/dolos-core";

export class Dolos {
readonly options: Options;
Expand All @@ -27,97 +20,9 @@ export class Dolos {
this.options = new Options(customOptions);
}

/**
 * Collects every regular file found under `dirPath`, descending into subdirectories.
 *
 * @param dirPath directory to start from
 * @returns the combined result of calling `readPath` on each file found
 */
private async fromDirectory(dirPath: string): Promise<Result<File[]>> {
// `dirs` doubles as a work queue: subdirectories are appended while it is being walked.
const dirs = [dirPath];
const files = [];

let i = 0;

while(i < dirs.length) {
for (const entry of await fs.readdir(dirs[i], { withFileTypes: true })) {
if (entry.isDirectory()) {
dirs.push(path.join(dirs[i], entry.name));
} else if (entry.isFile()) {
files.push(readPath(path.join(dirs[i], entry.name)));
}
}
i += 1;
}

return await Result.all(files);
}

/**
 * Reads the files contained in a ZIP archive.
 *
 * The archive is extracted into a fresh temporary directory with the external
 * `unzip` command. If the extracted root holds a readable `info.csv`, the files
 * are read through `fromCSV`; otherwise through `fromDirectory`.
 *
 * @param zipPath path of the archive
 * @throws Error when `unzip` cannot be spawned or exits with a non-zero status
 */
private async fromZIP(zipPath: string): Promise<Result<File[]>> {
const tmpDir = await fs.mkdtemp(path.join(tmpdir(), "dolos-unzip-"));
try {
const { status, error, stderr } = spawn("unzip", [zipPath, "-d", tmpDir]);
if (error) {
throw error;
} else if (status != 0) {
throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
}
const infoPath = path.join(tmpDir, "info.csv");
// Probe for a readable info.csv without throwing when it is absent.
if (await fs.access(infoPath, constants.R_OK).then(() => true).catch(() => false)) {
return await this.fromCSV(infoPath);
} else {
return await this.fromDirectory(tmpDir);
}
} finally {
// The temporary directory is removed whether or not reading succeeded.
await fs.rm(tmpDir, { recursive: true });
}
}

/**
 * Reads the files listed in a CSV info file.
 *
 * Every row is mapped to an `ExtraInfo` object and the file named in its
 * `filename` column is read relative to the directory containing the CSV.
 *
 * @param infoPath path of the CSV file
 * @throws Error with a generic message for any failure inside the `try` block
 */
private async fromCSV(infoPath: string): Promise<Result<File[]>> {
const dirname = path.dirname(infoPath);
try {
const files = csvParse((await fs.readFile(infoPath)).toString())
.map((row: DSVRowString) => ({
filename: row.filename as string,
fullName: row.full_name as string,
id: row.id as string,
status: row.status as string,
submissionID: row.submission_id as string,
nameEN: row.name_en as string,
nameNL: row.name_nl as string,
exerciseID: row.exercise_id as string,
createdAt: new Date(row.created_at as string),
// Either a `label` or a `labels` column is accepted; `label` wins when it is non-empty.
labels: row.label as string || row.labels as string
}))
.map((row: ExtraInfo) => readPath(path.join(dirname, row.filename), row));
return await Result.all(files);
} catch(e) {
throw new Error("The given '.csv'-file could not be opened");
}
}


/**
 * Reads the inputs at `paths` and analyzes them.
 *
 * NOTE(review): as shown here, the two statements after the first `return` are
 * unreachable. This span looks like a rendered diff that interleaves the
 * removed body with its replacement (the `Dataset.create` call) — confirm
 * against the actual file before relying on this text.
 *
 * @param paths input paths; a single path must be a ZIP archive or a CSV file
 * @param ignore optional path of a file that is read and passed to `analyze`
 */
public async analyzePaths(paths: string[], ignore?: string): Promise<Report> {
let files = null;
let nameCandidate = undefined;
if(paths.length == 1) {
const inputFile = paths[0];
if(inputFile.toLowerCase().endsWith(".zip")) {
files = this.fromZIP(inputFile);
nameCandidate = path.basename(inputFile, ".zip");
} else if(inputFile.toLowerCase().endsWith(".csv")) {
files = this.fromCSV(inputFile);
// Only an `info.csv` lends the name of its parent directory to the report.
if (inputFile.endsWith("info.csv")) {
nameCandidate = path.dirname(inputFile).split(path.sep).pop();
}
} else {
throw new Error("You gave one input file, but is not a CSV file or a ZIP archive.");
}
} else {
files = readFiles(paths);
if (paths.length === 2) {
nameCandidate = path.basename(paths[0]) + " & " + path.basename(paths[1]);
}
}
let ignoredFile;
if (ignore) {
ignoredFile = (await readPath(ignore)).ok();
}
return this.analyze((await files).ok(), nameCandidate, ignoredFile);
const dataset = await Dataset.create(paths, ignore);
return this.analyze(dataset.files, dataset.name, dataset.ignore);
}

public async analyze(
Expand Down
27 changes: 27 additions & 0 deletions lib/src/test/dolos.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,33 @@ test("should read CSV-files", async t => {
t.true(pairs[0].similarity > 0.75);
});

test("should read CSV-files with template code", async t => {
  // The fixture marks one row as ignored, so the remaining rows are the analyzed files.
  const report = await new Dolos().analyzePaths(["./src/test/fixtures/javascript/info_with_template.csv"]);

  const analyzed = report.files;
  t.is(5, analyzed.length);

  const [boilerplateCopy, uniqueImpl, alternativeImpl, similarImpl] = analyzed;

  // A copy of the boilerplate must not match any of the implementations.
  for (const implementation of [uniqueImpl, alternativeImpl, similarImpl]) {
    t.is(0, report.getPair(boilerplateCopy, implementation).similarity);
  }

  const uniqueVsAlternative = report.getPair(uniqueImpl, alternativeImpl);
  const uniqueVsSimilar = report.getPair(uniqueImpl, similarImpl);
  const alternativeVsSimilar = report.getPair(alternativeImpl, similarImpl);

  // The two related implementations should resemble each other more than either resembles the unique one.
  t.true(uniqueVsAlternative.similarity < alternativeVsSimilar.similarity, "Pairs with unique should be less similar");
  t.true(uniqueVsSimilar.similarity < alternativeVsSimilar.similarity, "Pairs with unique should be less similar");
  t.true(alternativeVsSimilar.similarity > 0.5, "Pairs with similar code should have a similarity above 50%");
});

test("should read ZIP-files with info.csv", async t => {
const dolos = new Dolos();

Expand Down
7 changes: 7 additions & 0 deletions lib/src/test/fixtures/javascript/info_with_template.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
filename,id,labels,created_at,ignored
boilerplate.js,1,original,2019-07-23 17:12:33 +0200,true
boilerplate_copy.js,1,original,2019-07-23 17:12:33 +0200
implementation-unique.js,2,copy,2019-07-25 11:02:57 +0200
implementation-alternative.js,3,copy,2019-07-25 14:43:20 +0200
implementation-alternative-similar.js,4,copy,2019-07-27 19:22:39 +0200
boilerplate_copy.js,5,template,2019-07-27 19:22:39 +0200

0 comments on commit ce7ab2f

Please sign in to comment.