diff --git a/js/.gitignore b/js/.gitignore
index 4b11d6959..b2563a1d4 100644
--- a/js/.gitignore
+++ b/js/.gitignore
@@ -51,6 +51,10 @@ Chinook_Sqlite.sql
 /evaluation/langchain.js
 /evaluation/langchain.d.ts
 /evaluation/langchain.d.cts
+/evaluation/llm.cjs
+/evaluation/llm.js
+/evaluation/llm.d.ts
+/evaluation/llm.d.cts
 /schemas.cjs
 /schemas.js
 /schemas.d.ts
diff --git a/js/package.json b/js/package.json
index cf7d3acfa..f4269a838 100644
--- a/js/package.json
+++ b/js/package.json
@@ -25,6 +25,10 @@
     "evaluation/langchain.js",
     "evaluation/langchain.d.ts",
     "evaluation/langchain.d.cts",
+    "evaluation/llm.cjs",
+    "evaluation/llm.js",
+    "evaluation/llm.d.ts",
+    "evaluation/llm.d.cts",
     "schemas.cjs",
     "schemas.js",
     "schemas.d.ts",
@@ -113,7 +117,6 @@
     "@babel/preset-env": "^7.22.4",
     "@faker-js/faker": "^8.4.1",
     "@jest/globals": "^29.5.0",
-    "@langchain/core": "^0.3.14",
     "@langchain/langgraph": "^0.2.18",
     "@langchain/openai": "^0.3.11",
     "@opentelemetry/sdk-trace-base": "^1.26.0",
@@ -132,13 +135,14 @@
     "eslint-plugin-no-instanceof": "^1.0.1",
     "eslint-plugin-prettier": "^4.2.1",
     "jest": "^29.5.0",
-    "langchain": "^0.3.3",
     "openai": "^4.67.3",
     "prettier": "^2.8.8",
     "ts-jest": "^29.1.0",
     "ts-node": "^10.9.1",
     "typescript": "^5.4.5",
-    "zod": "^3.23.8"
+    "zod": "^3.23.8",
+    "@langchain/core": "^0.3.14",
+    "langchain": "^0.3.3"
   },
   "peerDependencies": {
     "openai": "*"
   },
@@ -209,6 +213,15 @@
       "import": "./evaluation/langchain.js",
       "require": "./evaluation/langchain.cjs"
     },
+    "./evaluation/llm": {
+      "types": {
+        "import": "./evaluation/llm.d.ts",
+        "require": "./evaluation/llm.d.cts",
+        "default": "./evaluation/llm.d.ts"
+      },
+      "import": "./evaluation/llm.js",
+      "require": "./evaluation/llm.cjs"
+    },
     "./schemas": {
       "types": {
         "import": "./schemas.d.ts",
diff --git a/js/scripts/create-entrypoints.js b/js/scripts/create-entrypoints.js
index 9cce2ab22..b5bcb545f 100644
--- a/js/scripts/create-entrypoints.js
+++ b/js/scripts/create-entrypoints.js
@@ -12,6 +12,7 @@ const entrypoints = {
   traceable: "traceable",
   evaluation: "evaluation/index",
   "evaluation/langchain": "evaluation/langchain",
+  "evaluation/llm": "evaluation/llm_evaluator",
   schemas: "schemas",
   langchain: "langchain",
   vercel: "vercel",
diff --git a/js/src/evaluation/llm.ts b/js/src/evaluation/llm.ts
new file mode 100644
index 000000000..3f8c4cacd
--- /dev/null
+++ b/js/src/evaluation/llm.ts
@@ -0,0 +1,5 @@
+export {
+  CategoricalScoreConfig,
+  ContinuousScoreConfig,
+  LLMEvaluator,
+} from "./llm_evaluator.js";
diff --git a/js/src/evaluation/llm_evaluator.ts b/js/src/evaluation/llm_evaluator.ts
new file mode 100644
index 000000000..3f8bd2f4e
--- /dev/null
+++ b/js/src/evaluation/llm_evaluator.ts
@@ -0,0 +1,290 @@
+// eslint-disable-next-line import/no-extraneous-dependencies
+import { ChatPromptTemplate } from "@langchain/core/prompts";
+import * as uuid from "uuid";
+import {
+  EvaluationResult,
+  EvaluationResults,
+  RunEvaluator,
+} from "./evaluator.js";
+import type { Run, Example } from "../schemas.js";
+// eslint-disable-next-line import/no-extraneous-dependencies
+import { BaseLanguageModel } from "@langchain/core/language_models/base";
+
+/**
+ * Configuration for categorical (enum-based) scoring in evaluations.
+ * Used to define discrete categories or labels for evaluation results.
+ */
+export class CategoricalScoreConfig {
+  /** Feedback key for the evaluator */
+  key: string;
+  /** Array of valid categorical choices/labels that can be assigned */
+  choices: string[];
+  /** Description of what this score measures or represents */
+  description: string;
+  /** Optional key for the LLM reasoning/explanation for the score */
+  reasoningKey?: string;
+  /** Optional description of score reasoning, provided to the LLM in the structured output */
+  reasoningDescription?: string;
+
+  /**
+   * Creates a new categorical score configuration
+   * @param params Configuration parameters
+   * @param params.key Feedback key for the evaluator
+   * @param params.choices Array of valid categorical options
+   * @param params.description Description of the scoring criteria
+   * @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
+   * @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
+   */
+  constructor(params: {
+    key: string;
+    choices: string[];
+    description: string;
+    reasoningKey?: string;
+    reasoningDescription?: string;
+  }) {
+    this.key = params.key;
+    this.choices = params.choices;
+    this.description = params.description;
+    this.reasoningKey = params.reasoningKey;
+    this.reasoningDescription = params.reasoningDescription;
+  }
+}
+
+/**
+ * Configuration for continuous (numeric) scoring in evaluations.
+ * Used to define scores that fall within a numeric range.
+ */
+export class ContinuousScoreConfig {
+  /** Feedback key for the evaluator */
+  key: string;
+  /** Minimum allowed score value (defaults to 0) */
+  min: number;
+  /** Maximum allowed score value (defaults to 1) */
+  max: number;
+  /** Description of the scoring criteria */
+  description: string;
+  /** Optional key for the LLM reasoning/explanation for the score */
+  reasoningKey?: string;
+  /** Optional description of score reasoning, provided to the LLM in the structured output */
+  reasoningDescription?: string;
+
+  /**
+   * Creates a new continuous score configuration
+   * @param params Configuration parameters
+   * @param params.key Feedback key for the evaluator
+   * @param params.description Description of the scoring criteria
+   * @param params.min Optional minimum score value (defaults to 0)
+   * @param params.max Optional maximum score value (defaults to 1)
+   * @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
+   * @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
+   */
+  constructor(params: {
+    key: string;
+    description: string;
+    min?: number;
+    max?: number;
+    reasoningKey?: string;
+    reasoningDescription?: string;
+  }) {
+    this.key = params.key;
+    this.min = params.min ?? 0;
+    this.max = params.max ?? 1;
+    this.description = params.description;
+    this.reasoningKey = params.reasoningKey;
+    this.reasoningDescription = params.reasoningDescription;
+  }
+}
+
+type ScoreConfig = CategoricalScoreConfig | ContinuousScoreConfig;
+
+function createScoreJsonSchema(scoreConfig: ScoreConfig): Record<string, any> {
+  const properties: Record<string, any> = {};
+
+  if (scoreConfig.reasoningKey) {
+    properties[scoreConfig.reasoningKey] = {
+      type: "string",
+      description:
+        scoreConfig.reasoningDescription ||
+        "First, think step by step to explain your score.",
+    };
+  }
+
+  if ("choices" in scoreConfig) {
+    properties.value = {
+      type: "string",
+      enum: scoreConfig.choices,
+      description: `The score for the evaluation, one of ${scoreConfig.choices.join(
+        ", "
+      )}.`,
+    };
+  } else {
+    properties.score = {
+      type: "number",
+      minimum: scoreConfig.min,
+      maximum: scoreConfig.max,
+      description: `The score for the evaluation, between ${scoreConfig.min} and ${scoreConfig.max}, inclusive.`,
+    };
+  }
+
+  return {
+    title: scoreConfig.key,
+    description: scoreConfig.description,
+    type: "object",
+    properties,
+    required: scoreConfig.reasoningKey
+      ? ["choices" in scoreConfig ? "value" : "score", scoreConfig.reasoningKey]
+      : ["choices" in scoreConfig ? "value" : "score"],
+  };
+}
+
+interface LLMEvaluatorParams {
+  promptTemplate: string | [string, string][];
+  scoreConfig: ScoreConfig;
+  chatModel: BaseLanguageModel;
+  mapVariables?: (run: Run, example?: Example) => Record<string, any>;
+}
+
+export class LLMEvaluator implements RunEvaluator {
+  prompt: any;
+  mapVariables?: (run: Run, example?: Example) => Record<string, any>;
+  scoreConfig: ScoreConfig;
+  scoreSchema: Record<string, any>;
+  runnable: any;
+
+  constructor() {}
+
+  static async create(params: LLMEvaluatorParams): Promise<LLMEvaluator> {
+    const evaluator = new LLMEvaluator();
+    await evaluator.initialize(
+      params.promptTemplate,
+      params.scoreConfig,
+      params.chatModel,
+      params.mapVariables
+    );
+    return evaluator;
+  }
+  private async initialize(
+    promptTemplate: string | [string, string][],
+    scoreConfig: ScoreConfig,
+    chatModel: BaseLanguageModel,
+    mapVariables?: (run: Run, example?: Example) => Record<string, any>
+  ) {
+    try {
+      // Store the configuration
+      this.scoreConfig = scoreConfig;
+      this.mapVariables = mapVariables;
+
+      // Create the score schema
+      this.scoreSchema = createScoreJsonSchema(scoreConfig);
+
+      // Create the prompt template
+      if (typeof promptTemplate === "string") {
+        this.prompt = ChatPromptTemplate.fromMessages([
+          { role: "human", content: promptTemplate },
+        ]);
+      } else {
+        this.prompt = ChatPromptTemplate.fromMessages(promptTemplate);
+      }
+
+      const modelWithStructuredOutput = chatModel.withStructuredOutput
+        ? chatModel.withStructuredOutput(this.scoreSchema)
+        : null;
+      if (!modelWithStructuredOutput) {
+        throw new Error("Passed chat model must support structured output");
+      }
+      this.runnable = this.prompt.pipe(modelWithStructuredOutput);
+    } catch (e: unknown) {
+      throw new Error(
+        `Failed to initialize LLMEvaluator: ${(e as Error).message}`
+      );
+    }
+  }
+
+  async evaluateRun(
+    run: Run,
+    example?: Example
+  ): Promise<EvaluationResult | EvaluationResults> {
+    const runId = uuid.v4();
+    const variables = this.prepareVariables(run, example);
+    const output = await this.runnable.invoke(variables, { runId: runId });
+
+    return this.parseOutput(output, runId);
+  }
+
+  private prepareVariables(run: Run, example?: Example): Record<string, any> {
+    if (this.mapVariables) {
+      return this.mapVariables(run, example);
+    }
+
+    const variables: Record<string, any> = {};
+
+    // Input handling
+    if (Object.keys(run.inputs).length === 0) {
+      throw new Error(
+        "No input keys are present in run.inputs but the prompt requires 'input'."
+      );
+    }
+    if (Object.keys(run.inputs).length !== 1) {
+      throw new Error(
+        "Multiple input keys are present in run.inputs. Please provide a mapVariables function."
+      );
+    }
+    variables.input = Object.values(run.inputs)[0];
+
+    // Output handling
+    if (!run.outputs || Object.keys(run.outputs).length === 0) {
+      throw new Error(
+        "No output keys are present in run.outputs but the prompt requires 'output'."
+      );
+    }
+    if (Object.keys(run.outputs).length !== 1) {
+      throw new Error(
+        "Multiple output keys are present in run.outputs. Please provide a mapVariables function."
+      );
+    }
+    variables.output = Object.values(run.outputs)[0];
+
+    // Expected output handling
+    if (example?.outputs) {
+      if (Object.keys(example.outputs).length === 0) {
+        throw new Error(
+          "No output keys are present in example.outputs but the prompt requires 'expected'."
+        );
+      }
+      if (Object.keys(example.outputs).length !== 1) {
+        throw new Error(
+          "Multiple output keys are present in example.outputs. Please provide a mapVariables function."
+        );
+      }
+      variables.expected = Object.values(example.outputs)[0];
+    }
+
+    return variables;
+  }
+
+  private parseOutput(
+    output: Record<string, any>,
+    runId: string
+  ): EvaluationResult {
+    const explanation = this.scoreConfig.reasoningKey
+      ? output[this.scoreConfig.reasoningKey]
+      : undefined;
+    if ("choices" in this.scoreConfig) {
+      const value = output.value;
+      return {
+        key: this.scoreConfig.key,
+        value,
+        comment: explanation,
+        sourceRunId: runId,
+      };
+    } else {
+      const score = output.score;
+      return {
+        key: this.scoreConfig.key,
+        score,
+        comment: explanation,
+        sourceRunId: runId,
+      };
+    }
+  }
+}
diff --git a/js/src/tests/llm_evaluator.int.test.ts b/js/src/tests/llm_evaluator.int.test.ts
new file mode 100644
index 000000000..c5762a60d
--- /dev/null
+++ b/js/src/tests/llm_evaluator.int.test.ts
@@ -0,0 +1,183 @@
+import { expect, test } from "@jest/globals";
+import { Client } from "../index.js";
+import {
+  CategoricalScoreConfig,
+  ContinuousScoreConfig,
+  LLMEvaluator,
+} from "../evaluation/llm.js";
+import { evaluate } from "../evaluation/_runner.js";
+import { ChatOpenAI } from "@langchain/openai";
+
+const CHAT_MODEL = new ChatOpenAI({ model: "gpt-4" });
+
+const TESTING_DATASET_NAME = "LLMEvaluator dataset";
+
+test("llm evaluator initialization with categorical config", async () => {
+  const evaluator = await LLMEvaluator.create({
+    promptTemplate: "Is the response vague? Y/N\n{input}",
+    scoreConfig: new CategoricalScoreConfig({
+      key: "vagueness",
+      choices: ["Y", "N"],
+      description: "Whether the response is vague. Y for yes, N for no.",
+      reasoningKey: "explanation",
+    }),
+    chatModel: CHAT_MODEL,
+  });
+
+  expect(evaluator).toBeDefined();
+  // Check input variables extracted from template
+  expect(evaluator.prompt.inputVariables).toEqual(["input"]);
+  // Verify JSON schema for categorical scoring
+  expect(evaluator.scoreSchema).toEqual({
+    type: "object",
+    description: "Whether the response is vague. Y for yes, N for no.",
+    title: "vagueness",
+    properties: {
+      value: {
+        type: "string",
+        enum: ["Y", "N"],
+        description: "The score for the evaluation, one of Y, N.",
+      },
+      explanation: {
+        type: "string",
+        description: "First, think step by step to explain your score.",
+      },
+    },
+    required: ["value", "explanation"],
+  });
+
+  expect((evaluator.scoreConfig as CategoricalScoreConfig).choices).toEqual([
+    "Y",
+    "N",
+  ]);
+});
+
+test("llm evaluator initialization with continuous config", async () => {
+  const evaluator = await LLMEvaluator.create({
+    promptTemplate: "Rate the response from 0 to 1.\n{input}",
+    scoreConfig: new ContinuousScoreConfig({
+      key: "rating",
+      description: "The rating of the response, from 0 to 1.",
+      min: 0,
+      max: 1,
+    }),
+    chatModel: CHAT_MODEL,
+  });
+
+  expect(evaluator).toBeDefined();
+  // Check input variables extracted from template
+  expect(evaluator.prompt.inputVariables).toEqual(["input"]);
+  // Verify JSON schema for continuous scoring
+  expect(evaluator.scoreSchema).toEqual({
+    type: "object",
+    title: "rating",
+    description: "The rating of the response, from 0 to 1.",
+    properties: {
+      score: {
+        type: "number",
+        minimum: 0,
+        maximum: 1,
+        description:
+          "The score for the evaluation, between 0 and 1, inclusive.",
+      },
+    },
+    required: ["score"],
+  });
+  // Verify score config properties
+  expect(evaluator.scoreConfig.key).toBe("rating");
+  expect((evaluator.scoreConfig as ContinuousScoreConfig).min).toBe(0);
+  expect((evaluator.scoreConfig as ContinuousScoreConfig).max).toBe(1);
+});
+
+test("llm evaluator with custom variable mapping", async () => {
+  const evaluator = await LLMEvaluator.create({
+    promptTemplate: [
+      [
+        "system",
+        "Is the output accurate with respect to the context and question? Y/N",
+      ],
+      ["human", "Context: {context}\nQuestion: {question}\nOutput: {output}"],
+    ],
+    scoreConfig: new CategoricalScoreConfig({
+      key: "accuracy",
+      choices: ["Y", "N"],
+      description:
+        "Whether the output is accurate with respect to the context and question.",
+      reasoningKey: "explanation",
+      reasoningDescription: "First, think step by step to explain your score.",
+    }),
+    chatModel: CHAT_MODEL,
+    mapVariables: (run: any, example?: any) => ({
+      context: example?.inputs?.context || "",
+      question: example?.inputs?.question || "",
+      output: run.outputs?.output || "",
+    }),
+  });
+
+  expect(evaluator).toBeDefined();
+});
+
+test("llm evaluator can evaluate runs", async () => {
+  const client = new Client();
+  await client.clonePublicDataset(
+    "https://beta.smith.langchain.com/public/06785303-0f70-4466-b637-f23d38c0f28e/d",
+    {
+      datasetName: TESTING_DATASET_NAME,
+    }
+  );
+  const evaluator = await LLMEvaluator.create({
+    promptTemplate: "Is the response vague? Y/N\n{response}",
+    scoreConfig: new CategoricalScoreConfig({
+      key: "vagueness",
+      choices: ["Y", "N"],
+      description: "Whether the response is vague. Y for yes, N for no.",
+      reasoningKey: "explanation",
+      reasoningDescription: "First, think step by step to explain your score.",
+    }),
+    chatModel: CHAT_MODEL,
+    mapVariables: (run: any, _example?: any) => ({
+      response: run.outputs?.["output"] ?? "",
+    }),
+  });
+
+  const targetFunc = (input: Record<string, any>) => {
+    return { output: input.question + " This is a test response" };
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [evaluator],
+    description: "LLM evaluator test run",
+  });
+
+  expect(evalRes.results).toHaveLength(10);
+  const firstResult = evalRes.results[0];
+
+  const evaluation = firstResult.evaluationResults.results[0];
+  expect(evaluation.key).toBe("vagueness");
+  expect(["Y", "N"]).toContain(evaluation.value);
+  expect(evaluation.comment).toBeDefined();
+
+  await client.deleteDataset({ datasetName: TESTING_DATASET_NAME });
+});
+
+test("llm evaluator with multiple prompt messages", async () => {
+  const evaluator = await LLMEvaluator.create({
+    promptTemplate: [
+      ["system", "You are a helpful assistant evaluating responses."],
+      ["human", "Rate this response from 0 to 1: {response}"],
+    ],
+    scoreConfig: new ContinuousScoreConfig({
+      key: "rating",
+      description: "Quality rating from 0 to 1",
+      min: 0,
+      max: 1,
+    }),
+    chatModel: CHAT_MODEL,
+    mapVariables: (run: any, _example?: any) => ({
+      response: run.outputs?.["output"] ?? "",
+    }),
+  });
+
+  expect(evaluator).toBeDefined();
+});
diff --git a/js/tsconfig.json b/js/tsconfig.json
index b778ed83f..5d3d8b2c5 100644
--- a/js/tsconfig.json
+++ b/js/tsconfig.json
@@ -38,6 +38,7 @@
     "src/traceable.ts",
     "src/evaluation/index.ts",
     "src/evaluation/langchain.ts",
+    "src/evaluation/llm_evaluator.ts",
     "src/schemas.ts",
     "src/langchain.ts",
     "src/vercel.ts",
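
Usage sketch (illustrative only, not part of the patch): with the new "./evaluation/llm" export wired up above, consumers of the published langsmith package should be able to import the evaluator from the `langsmith/evaluation/llm` subpath and plug it into the existing `evaluate()` harness, assuming `evaluate` remains re-exported from the `langsmith/evaluation` entrypoint. The dataset name below is hypothetical; the model and variable mapping mirror the integration test.

  import { CategoricalScoreConfig, LLMEvaluator } from "langsmith/evaluation/llm";
  import { evaluate } from "langsmith/evaluation";
  import { ChatOpenAI } from "@langchain/openai";

  async function main() {
    // LLM-as-judge evaluator that labels each run's output as vague (Y) or not (N),
    // with the judge's reasoning captured under the "explanation" key.
    const vagueness = await LLMEvaluator.create({
      promptTemplate: "Is the response vague? Y/N\n{response}",
      scoreConfig: new CategoricalScoreConfig({
        key: "vagueness",
        choices: ["Y", "N"],
        description: "Whether the response is vague. Y for yes, N for no.",
        reasoningKey: "explanation",
      }),
      chatModel: new ChatOpenAI({ model: "gpt-4" }),
      // Map each run's output onto the {response} prompt variable; without
      // mapVariables, the evaluator binds a single input/output to {input}/{output}.
      mapVariables: (run: any) => ({
        response: run.outputs?.output ?? "",
      }),
    });

    // Run the target function over a dataset and attach the evaluator's feedback.
    await evaluate(
      (input: Record<string, any>) => ({ output: `You asked: ${input.question}` }),
      {
        data: "my-dataset", // hypothetical dataset name
        evaluators: [vagueness],
      }
    );
  }

  main().catch(console.error);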