add LLMEvaluator to JS #1186

Open · wants to merge 20 commits into main
4 changes: 4 additions & 0 deletions js/.gitignore
@@ -51,6 +51,10 @@ Chinook_Sqlite.sql
/evaluation/langchain.js
/evaluation/langchain.d.ts
/evaluation/langchain.d.cts
/evaluation/llm.cjs
/evaluation/llm.js
/evaluation/llm.d.ts
/evaluation/llm.d.cts
/schemas.cjs
/schemas.js
/schemas.d.ts
19 changes: 16 additions & 3 deletions js/package.json
@@ -25,6 +25,10 @@
"evaluation/langchain.js",
"evaluation/langchain.d.ts",
"evaluation/langchain.d.cts",
"evaluation/llm.cjs",
"evaluation/llm.js",
"evaluation/llm.d.ts",
"evaluation/llm.d.cts",
"schemas.cjs",
"schemas.js",
"schemas.d.ts",
@@ -113,7 +117,6 @@
"@babel/preset-env": "^7.22.4",
"@faker-js/faker": "^8.4.1",
"@jest/globals": "^29.5.0",
"@langchain/core": "^0.3.14",
"@langchain/langgraph": "^0.2.18",
"@langchain/openai": "^0.3.11",
"@opentelemetry/sdk-trace-base": "^1.26.0",
@@ -132,13 +135,14 @@
"eslint-plugin-no-instanceof": "^1.0.1",
"eslint-plugin-prettier": "^4.2.1",
"jest": "^29.5.0",
"langchain": "^0.3.3",
"openai": "^4.67.3",
"prettier": "^2.8.8",
"ts-jest": "^29.1.0",
"ts-node": "^10.9.1",
"typescript": "^5.4.5",
"zod": "^3.23.8"
"zod": "^3.23.8",
"@langchain/core": "^0.3.14",
Collaborator:
Good to keep these sorted and avoid touching unnecessary lines

"langchain": "^0.3.3"
},
"peerDependencies": {
"openai": "*"
@@ -209,6 +213,15 @@
"import": "./evaluation/langchain.js",
"require": "./evaluation/langchain.cjs"
},
"./evaluation/llm": {
"types": {
"import": "./evaluation/llm.d.ts",
"require": "./evaluation/llm.d.cts",
"default": "./evaluation/llm.d.ts"
},
"import": "./evaluation/llm.js",
"require": "./evaluation/llm.cjs"
},
"./schemas": {
"types": {
"import": "./schemas.d.ts",
1 change: 1 addition & 0 deletions js/scripts/create-entrypoints.js
@@ -12,6 +12,7 @@ const entrypoints = {
traceable: "traceable",
evaluation: "evaluation/index",
"evaluation/langchain": "evaluation/langchain",
"evaluation/llm": "evaluation/llm_evaluator",
Collaborator:
Naming should match filepath

So rename file below to evaluation/llm.ts

schemas: "schemas",
langchain: "langchain",
vercel: "vercel",
5 changes: 5 additions & 0 deletions js/src/evaluation/llm.ts
@@ -0,0 +1,5 @@
export {
CategoricalScoreConfig,
ContinuousScoreConfig,
LLMEvaluator,
} from "./llm_evaluator.js";
290 changes: 290 additions & 0 deletions js/src/evaluation/llm_evaluator.ts
@@ -0,0 +1,290 @@
// eslint-disable-next-line import/no-extraneous-dependencies
import { ChatPromptTemplate } from "@langchain/core/prompts";
import * as uuid from "uuid";
import {
EvaluationResult,
EvaluationResults,
RunEvaluator,
} from "./evaluator.js";
import type { Run, Example } from "../schemas.js";
// eslint-disable-next-line import/no-extraneous-dependencies
import { BaseLanguageModel } from "@langchain/core/language_models/base";

/**
* Configuration for categorical (enum-based) scoring in evaluations.
* Used to define discrete categories or labels for evaluation results.
*/
export class CategoricalScoreConfig {
Collaborator:
We should add good typedocs :)

Contributor Author:
tried my best, lmk what needs improving

/** Feedback key for the evaluator */
key: string;
/** Array of valid categorical choices/labels that can be assigned */
choices: string[];
/** Description of what this score measures or represents */
description: string;
/** Optional key for the LLM reasoning/explanation for the score */
reasoningKey?: string;
/** Optional description of score reasoning, provided to the LLM in the structured output */
reasoningDescription?: string;

/**
* Creates a new categorical score configuration
* @param params Configuration parameters
* @param params.key Feedback key for the evaluator
* @param params.choices Array of valid categorical options
* @param params.description Description of the scoring criteria
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
*/
constructor(params: {
key: string;
choices: string[];
description: string;
reasoningKey?: string;
reasoningDescription?: string;
}) {
this.key = params.key;
this.choices = params.choices;
this.description = params.description;
this.reasoningKey = params.reasoningKey;
this.reasoningDescription = params.reasoningDescription;
}
}
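
For illustration, a minimal usage sketch (the key, choices, and description here are hypothetical):

```ts
// A categorical judge that labels answers and asks the model to explain itself.
const helpfulness = new CategoricalScoreConfig({
  key: "helpfulness",
  choices: ["helpful", "unhelpful"],
  description: "Whether the answer addresses the user's question.",
  reasoningKey: "explanation",
});
```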

/**
* Configuration for continuous (numeric) scoring in evaluations.
* Used to define scores that fall within a numeric range.
*/
export class ContinuousScoreConfig {
/** Feedback key for the evaluator */
key: string;
/** Minimum allowed score value (defaults to 0) */
min: number;
/** Maximum allowed score value (defaults to 1) */
max: number;
/** Description of the scoring criteria */
description: string;
/** Optional key for the LLM reasoning/explanation for the score */
reasoningKey?: string;
/** Optional description of score reasoning, provided to the LLM in the structured output */
reasoningDescription?: string;

/**
* Creates a new continuous score configuration
* @param params Configuration parameters
* @param params.key Feedback key for the evaluator
* @param params.description Description of the scoring criteria
* @param params.min Optional minimum score value (defaults to 0)
* @param params.max Optional maximum score value (defaults to 1)
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
*/
constructor(params: {
key: string;
description: string;
min?: number;
max?: number;
reasoningKey?: string;
reasoningDescription?: string;
}) {
this.key = params.key;
this.min = params.min ?? 0;
this.max = params.max ?? 1;
this.description = params.description;
this.reasoningKey = params.reasoningKey;
this.reasoningDescription = params.reasoningDescription;
}
}
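
And the continuous counterpart (again with hypothetical values; min and max default to 0 and 1 when omitted):

```ts
// A numeric judge scoring factual accuracy on the default 0-1 scale.
const accuracy = new ContinuousScoreConfig({
  key: "accuracy",
  description: "How factually accurate the answer is.",
  reasoningKey: "explanation",
});
```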

type ScoreConfig = CategoricalScoreConfig | ContinuousScoreConfig;

function createScoreJsonSchema(scoreConfig: ScoreConfig): Record<string, any> {
const properties: Record<string, any> = {};

if (scoreConfig.reasoningKey) {
properties[scoreConfig.reasoningKey] = {
type: "string",
description:
scoreConfig.reasoningDescription ||
"First, think step by step to explain your score.",
};
}

if ("choices" in scoreConfig) {
properties.value = {
type: "string",
enum: scoreConfig.choices,
description: `The score for the evaluation, one of ${scoreConfig.choices.join(
", "
)}.`,
};
} else {
properties.score = {
type: "number",
minimum: scoreConfig.min,
maximum: scoreConfig.max,
description: `The score for the evaluation, between ${scoreConfig.min} and ${scoreConfig.max}, inclusive.`,
};
}

return {
title: scoreConfig.key,
description: scoreConfig.description,
type: "object",
properties,
required: scoreConfig.reasoningKey
? ["choices" in scoreConfig ? "value" : "score", scoreConfig.reasoningKey]
: ["choices" in scoreConfig ? "value" : "score"],
};
}
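
To make the output shape concrete, this is what createScoreJsonSchema produces for the categorical sketch above (traced from the code above, not from a separate spec):

```ts
const schema = {
  title: "helpfulness",
  description: "Whether the answer addresses the user's question.",
  type: "object",
  properties: {
    explanation: {
      type: "string",
      description: "First, think step by step to explain your score.",
    },
    value: {
      type: "string",
      enum: ["helpful", "unhelpful"],
      description: "The score for the evaluation, one of helpful, unhelpful.",
    },
  },
  required: ["value", "explanation"],
};
```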

interface LLMEvaluatorParams {
promptTemplate: string | [string, string][];
scoreConfig: ScoreConfig;
chatModel: BaseLanguageModel;
Collaborator:
Should be BaseChatModel
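
A sketch of that change (BaseChatModel is exported from @langchain/core/language_models/chat_models):

```ts
import { BaseChatModel } from "@langchain/core/language_models/chat_models";

interface LLMEvaluatorParams {
  promptTemplate: string | [string, string][];
  scoreConfig: ScoreConfig;
  chatModel: BaseChatModel; // narrower than BaseLanguageModel; guarantees chat semantics
  mapVariables?: (run: Run, example?: Example) => Record<string, any>;
}
```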

mapVariables?: (run: Run, example?: Example) => Record<string, any>;
}

export class LLMEvaluator implements RunEvaluator {
Collaborator:
No docstring?

prompt: any;
mapVariables?: (run: Run, example?: Example) => Record<string, any>;
Collaborator:
What are mapVariables?
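
Judging from prepareVariables below, mapVariables is an optional escape hatch that turns a run/example pair into the prompt's template variables when run.inputs or run.outputs has more than one key. A sketch with hypothetical key names:

```ts
// Only needed when runs carry multiple input/output keys.
const mapVariables = (run: Run, example?: Example) => ({
  input: run.inputs.question,          // hypothetical input key
  output: run.outputs?.answer,         // hypothetical output key
  expected: example?.outputs?.answer,  // hypothetical reference key
});
```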

scoreConfig: ScoreConfig;
scoreSchema: Record<string, any>;
runnable: any;

constructor() {}

static async create(params: LLMEvaluatorParams): Promise<LLMEvaluator> {
Collaborator:
You definitely shouldn't have two methods for this, ideally you just use the constructor
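
One way that could look (a sketch that keeps initialize's logic but runs it synchronously, since nothing in it actually awaits):

```ts
export class LLMEvaluator implements RunEvaluator {
  constructor(params: LLMEvaluatorParams) {
    this.scoreConfig = params.scoreConfig;
    this.mapVariables = params.mapVariables;
    this.scoreSchema = createScoreJsonSchema(params.scoreConfig);
    this.prompt =
      typeof params.promptTemplate === "string"
        ? ChatPromptTemplate.fromMessages([
            { role: "human", content: params.promptTemplate },
          ])
        : ChatPromptTemplate.fromMessages(params.promptTemplate);
    this.runnable = this.prompt.pipe(
      params.chatModel.withStructuredOutput(this.scoreSchema)
    );
  }
  // ...
}
```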

const evaluator = new LLMEvaluator();
await evaluator.initialize(
params.promptTemplate,
params.scoreConfig,
params.chatModel,
params.mapVariables
);
return evaluator;
}
private async initialize(
promptTemplate: string | [string, string][],
scoreConfig: ScoreConfig,
chatModel: BaseLanguageModel,
mapVariables?: (run: Run, example?: Example) => Record<string, any>
) {
try {
// Store the configuration
this.scoreConfig = scoreConfig;
Collaborator:
More conventional to make this a constructor instead of a factory method

this.mapVariables = mapVariables;

// Create the score schema
this.scoreSchema = createScoreJsonSchema(scoreConfig);

// Create the prompt template
if (typeof promptTemplate === "string") {
this.prompt = ChatPromptTemplate.fromMessages([
{ role: "human", content: promptTemplate },
]);
} else {
this.prompt = ChatPromptTemplate.fromMessages(promptTemplate);
}

const modelWithStructuredOutput = chatModel.withStructuredOutput
? chatModel.withStructuredOutput(this.scoreSchema)
: null;
if (!modelWithStructuredOutput) {
throw new Error("Passed chat model must support structured output");
}
this.runnable = this.prompt.pipe(modelWithStructuredOutput);
} catch (e: unknown) {
throw new Error(
`Failed to initialize LLMEvaluator: ${(e as Error).message}`
);
}
}

async evaluateRun(
run: Run,
example?: Example
): Promise<EvaluationResult | EvaluationResults> {
const runId = uuid.v4();
const variables = this.prepareVariables(run, example);
const output = await this.runnable.invoke(variables, { runId: runId });

return this.parseOutput(output, runId);
}

private prepareVariables(run: Run, example?: Example): Record<string, any> {
if (this.mapVariables) {
return this.mapVariables(run, example);
}

const variables: Record<string, any> = {};

// Input handling
if (Object.keys(run.inputs).length === 0) {
throw new Error(
"No input keys are present in run.inputs but the prompt requires 'input'."
);
}
if (Object.keys(run.inputs).length !== 1) {
throw new Error(
"Multiple input keys are present in run.inputs. Please provide a map_variables function."
);
}
variables.input = Object.values(run.inputs)[0];

// Output handling
if (!run.outputs || Object.keys(run.outputs).length === 0) {
throw new Error(
"No output keys are present in run.outputs but the prompt requires 'output'."
);
}
if (Object.keys(run.outputs).length !== 1) {
throw new Error(
"Multiple output keys are present in run.outputs. Please provide a map_variables function."
);
}
variables.output = Object.values(run.outputs)[0];

// Expected output handling
if (example?.outputs) {
if (Object.keys(example.outputs).length === 0) {
throw new Error(
"No output keys are present in example.outputs but the prompt requires 'expected'."
);
}
if (Object.keys(example.outputs).length !== 1) {
throw new Error(
"Multiple output keys are present in example.outputs. Please provide a map_variables function."
);
}
variables.expected = Object.values(example.outputs)[0];
}

return variables;
}

private parseOutput(
Collaborator:
Prefer protected instead of private to make subclassing possible

output: Record<string, any>,
runId: string
): EvaluationResult {
const explanation = this.scoreConfig.reasoningKey
? output[this.scoreConfig.reasoningKey]
: undefined;
if ("choices" in this.scoreConfig) {
const value = output.value;
return {
key: this.scoreConfig.key,
value,
comment: explanation,
sourceRunId: runId,
};
} else {
const score = output.score;
return {
key: this.scoreConfig.key,
score,
comment: explanation,
sourceRunId: runId,
};
}
}
}
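
For reviewers, an end-to-end sketch of how the evaluator might be wired up (the model and prompt are hypothetical; run and example would come from a LangSmith experiment):

```ts
import { ChatOpenAI } from "@langchain/openai";

const evaluator = await LLMEvaluator.create({
  promptTemplate:
    "Rate the answer.\nQuestion: {input}\nAnswer: {output}\nReference: {expected}",
  scoreConfig: new ContinuousScoreConfig({
    key: "accuracy",
    description: "How factually accurate the answer is compared to the reference.",
  }),
  chatModel: new ChatOpenAI({ model: "gpt-4o-mini" }),
});

// Returns { key, score, comment, sourceRunId } for continuous configs.
const result = await evaluator.evaluateRun(run, example);
```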