-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add LLMEvaluator to JS #1186
base: main
Are you sure you want to change the base?
add LLMEvaluator to JS #1186
Changes from all commits
a70190f
d89b1b2
7e77aa7
c6ee092
1b0eae3
b22d5fb
ec3921f
b86e743
4e6cebf
9a0268b
e6f217b
e02f3df
df2c0a5
f1b6eef
334451a
6b572dc
0e3e935
821c695
5eb1e01
a7a60b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ const entrypoints = { | |
traceable: "traceable", | ||
evaluation: "evaluation/index", | ||
"evaluation/langchain": "evaluation/langchain", | ||
"evaluation/llm": "evaluation/llm_evaluator", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Naming should match the file path — so rename the file below to match the entrypoint name. |
||
schemas: "schemas", | ||
langchain: "langchain", | ||
vercel: "vercel", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
// Public entrypoint for the LLM-based evaluator: re-exports the score
// configuration classes and the evaluator itself from the implementation file.
export {
  CategoricalScoreConfig,
  ContinuousScoreConfig,
  LLMEvaluator,
} from "./llm_evaluator.js";
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,290 @@ | ||
// eslint-disable-next-line import/no-extraneous-dependencies | ||
import { ChatPromptTemplate } from "@langchain/core/prompts"; | ||
import * as uuid from "uuid"; | ||
import { | ||
EvaluationResult, | ||
EvaluationResults, | ||
RunEvaluator, | ||
} from "./evaluator.js"; | ||
import type { Run, Example } from "../schemas.js"; | ||
// eslint-disable-next-line import/no-extraneous-dependencies | ||
import { BaseLanguageModel } from "@langchain/core/language_models/base"; | ||
|
||
/** | ||
* Configuration for categorical (enum-based) scoring in evaluations. | ||
* Used to define discrete categories or labels for evaluation results. | ||
*/ | ||
export class CategoricalScoreConfig { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should add good typedocs :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tried my best, lmk what needs improving |
||
/** Feedback key for the evaluator */ | ||
key: string; | ||
/** Array of valid categorical choices/labels that can be assigned */ | ||
choices: string[]; | ||
/** Description of what this score measures or represents */ | ||
description: string; | ||
/** Optional key for the LLM reasoning/explanation for the score */ | ||
reasoningKey?: string; | ||
/** Optional description of score reasoning, provided to the LLM in the structured output */ | ||
reasoningDescription?: string; | ||
|
||
/** | ||
* Creates a new categorical score configuration | ||
* @param params Configuration parameters | ||
* @param params.key Feedback key for the evaluator | ||
* @param params.choices Array of valid categorical options | ||
* @param params.description Description of the scoring criteria | ||
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score | ||
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output | ||
*/ | ||
constructor(params: { | ||
key: string; | ||
choices: string[]; | ||
description: string; | ||
reasoningKey?: string; | ||
reasoningDescription?: string; | ||
}) { | ||
this.key = params.key; | ||
this.choices = params.choices; | ||
this.description = params.description; | ||
this.reasoningKey = params.reasoningKey; | ||
this.reasoningDescription = params.reasoningDescription; | ||
} | ||
} | ||
|
||
/** | ||
* Configuration for continuous (numeric) scoring in evaluations. | ||
* Used to define scores that fall within a numeric range. | ||
*/ | ||
export class ContinuousScoreConfig { | ||
/** Feedback key for the evaluator */ | ||
key: string; | ||
/** Minimum allowed score value (defaults to 0) */ | ||
min: number; | ||
/** Maximum allowed score value (defaults to 1) */ | ||
max: number; | ||
/** Description of the scoring criteria */ | ||
description: string; | ||
/** Optional key for the LLM reasoning/explanation for the score */ | ||
reasoningKey?: string; | ||
/** Optional description of score reasoning, provided to the LLM in the structured output */ | ||
reasoningDescription?: string; | ||
|
||
/** | ||
* Creates a new continuous score configuration | ||
* @param params Configuration parameters | ||
* @param params.key Feedback key for the evaluator | ||
* @param params.description Description of the scoring criteria | ||
* @param params.min Optional minimum score value (defaults to 0) | ||
* @param params.max Optional maximum score value (defaults to 1) | ||
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score | ||
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output | ||
*/ | ||
constructor(params: { | ||
key: string; | ||
description: string; | ||
min?: number; | ||
max?: number; | ||
reasoningKey?: string; | ||
reasoningDescription?: string; | ||
}) { | ||
this.key = params.key; | ||
this.min = params.min ?? 0; | ||
this.max = params.max ?? 1; | ||
this.description = params.description; | ||
this.reasoningKey = params.reasoningKey; | ||
this.reasoningDescription = params.reasoningDescription; | ||
} | ||
} | ||
|
||
type ScoreConfig = CategoricalScoreConfig | ContinuousScoreConfig; | ||
|
||
function createScoreJsonSchema(scoreConfig: ScoreConfig): Record<string, any> { | ||
const properties: Record<string, any> = {}; | ||
|
||
if (scoreConfig.reasoningKey) { | ||
properties[scoreConfig.reasoningKey] = { | ||
type: "string", | ||
description: | ||
scoreConfig.reasoningDescription || | ||
"First, think step by step to explain your score.", | ||
}; | ||
} | ||
|
||
if ("choices" in scoreConfig) { | ||
properties.value = { | ||
type: "string", | ||
enum: scoreConfig.choices, | ||
description: `The score for the evaluation, one of ${scoreConfig.choices.join( | ||
", " | ||
)}.`, | ||
}; | ||
} else { | ||
properties.score = { | ||
type: "number", | ||
minimum: scoreConfig.min, | ||
maximum: scoreConfig.max, | ||
description: `The score for the evaluation, between ${scoreConfig.min} and ${scoreConfig.max}, inclusive.`, | ||
}; | ||
} | ||
|
||
return { | ||
title: scoreConfig.key, | ||
description: scoreConfig.description, | ||
type: "object", | ||
properties, | ||
required: scoreConfig.reasoningKey | ||
? ["choices" in scoreConfig ? "value" : "score", scoreConfig.reasoningKey] | ||
: ["choices" in scoreConfig ? "value" : "score"], | ||
}; | ||
} | ||
|
||
/** Parameters for constructing an LLMEvaluator via {@link LLMEvaluator.create}. */
interface LLMEvaluatorParams {
  /** Prompt as a single human-message string, or an array of [role, template] message tuples. */
  promptTemplate: string | [string, string][];
  /** Categorical or continuous scoring configuration. */
  scoreConfig: ScoreConfig;
  /** Chat model used for grading; must support `withStructuredOutput`. */
  chatModel: BaseLanguageModel;
  /** Optional mapping from a run (and optional example) to the prompt's template variables. */
  mapVariables?: (run: Run, example?: Example) => Record<string, any>;
}
|
||
export class LLMEvaluator implements RunEvaluator { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No docstring? |
||
prompt: any; | ||
mapVariables?: (run: Run, example?: Example) => Record<string, any>; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are mapVariables? |
||
scoreConfig: ScoreConfig; | ||
scoreSchema: Record<string, any>; | ||
runnable: any; | ||
|
||
constructor() {} | ||
|
||
static async create(params: LLMEvaluatorParams): Promise<LLMEvaluator> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You definitely shouldn't have two methods for this, ideally you just use the constructor |
||
const evaluator = new LLMEvaluator(); | ||
await evaluator.initialize( | ||
params.promptTemplate, | ||
params.scoreConfig, | ||
params.chatModel, | ||
params.mapVariables | ||
); | ||
return evaluator; | ||
} | ||
private async initialize( | ||
promptTemplate: string | [string, string][], | ||
scoreConfig: ScoreConfig, | ||
chatModel: BaseLanguageModel, | ||
mapVariables?: (run: Run, example?: Example) => Record<string, any> | ||
) { | ||
try { | ||
// Store the configuration | ||
this.scoreConfig = scoreConfig; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More conventional to make this a constructor instead of a factory method |
||
this.mapVariables = mapVariables; | ||
|
||
// Create the score schema | ||
this.scoreSchema = createScoreJsonSchema(scoreConfig); | ||
|
||
// Create the prompt template | ||
if (typeof promptTemplate === "string") { | ||
this.prompt = ChatPromptTemplate.fromMessages([ | ||
{ role: "human", content: promptTemplate }, | ||
]); | ||
} else { | ||
this.prompt = ChatPromptTemplate.fromMessages(promptTemplate); | ||
} | ||
|
||
const modelWithStructuredOutput = chatModel.withStructuredOutput | ||
? chatModel.withStructuredOutput(this.scoreSchema) | ||
: null; | ||
if (!modelWithStructuredOutput) { | ||
throw new Error("Passed chat model must support structured output"); | ||
} | ||
this.runnable = this.prompt.pipe(modelWithStructuredOutput); | ||
} catch (e: unknown) { | ||
throw new Error( | ||
`Failed to initialize LLMEvaluator: ${(e as Error).message}` | ||
); | ||
} | ||
} | ||
|
||
async evaluateRun( | ||
run: Run, | ||
example?: Example | ||
): Promise<EvaluationResult | EvaluationResults> { | ||
const runId = uuid.v4(); | ||
const variables = this.prepareVariables(run, example); | ||
const output = await this.runnable.invoke(variables, { runId: runId }); | ||
|
||
return this.parseOutput(output, runId); | ||
} | ||
|
||
private prepareVariables(run: Run, example?: Example): Record<string, any> { | ||
if (this.mapVariables) { | ||
return this.mapVariables(run, example); | ||
} | ||
|
||
const variables: Record<string, any> = {}; | ||
|
||
// Input handling | ||
if (Object.keys(run.inputs).length === 0) { | ||
throw new Error( | ||
"No input keys are present in run.inputs but the prompt requires 'input'." | ||
); | ||
} | ||
if (Object.keys(run.inputs).length !== 1) { | ||
throw new Error( | ||
"Multiple input keys are present in run.inputs. Please provide a map_variables function." | ||
); | ||
} | ||
variables.input = Object.values(run.inputs)[0]; | ||
|
||
// Output handling | ||
if (!run.outputs || Object.keys(run.outputs).length === 0) { | ||
throw new Error( | ||
"No output keys are present in run.outputs but the prompt requires 'output'." | ||
); | ||
} | ||
if (Object.keys(run.outputs).length !== 1) { | ||
throw new Error( | ||
"Multiple output keys are present in run.outputs. Please provide a map_variables function." | ||
); | ||
} | ||
variables.output = Object.values(run.outputs)[0]; | ||
|
||
// Expected output handling | ||
if (example?.outputs) { | ||
if (Object.keys(example.outputs).length === 0) { | ||
throw new Error( | ||
"No output keys are present in example.outputs but the prompt requires 'expected'." | ||
); | ||
} | ||
if (Object.keys(example.outputs).length !== 1) { | ||
throw new Error( | ||
"Multiple output keys are present in example.outputs. Please provide a map_variables function." | ||
); | ||
} | ||
variables.expected = Object.values(example.outputs)[0]; | ||
} | ||
|
||
return variables; | ||
} | ||
|
||
private parseOutput( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prefer protected instead of private to make subclassing possible |
||
output: Record<string, any>, | ||
runId: string | ||
): EvaluationResult { | ||
const explanation = this.scoreConfig.reasoningKey | ||
? output[this.scoreConfig.reasoningKey] | ||
: undefined; | ||
if ("choices" in this.scoreConfig) { | ||
const value = output.value; | ||
return { | ||
key: this.scoreConfig.key, | ||
value, | ||
comment: explanation, | ||
sourceRunId: runId, | ||
}; | ||
} else { | ||
const score = output.score; | ||
return { | ||
key: this.scoreConfig.key, | ||
score, | ||
comment: explanation, | ||
sourceRunId: runId, | ||
}; | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good to keep these sorted and avoid touching unnecessary lines