add LLMEvaluator to JS #1186

Open · wants to merge 20 commits into main
4 changes: 4 additions & 0 deletions js/.gitignore
@@ -51,6 +51,10 @@ Chinook_Sqlite.sql
/evaluation/langchain.js
/evaluation/langchain.d.ts
/evaluation/langchain.d.cts
/evaluation/llm.cjs
/evaluation/llm.js
/evaluation/llm.d.ts
/evaluation/llm.d.cts
/schemas.cjs
/schemas.js
/schemas.d.ts
19 changes: 16 additions & 3 deletions js/package.json
@@ -25,6 +25,10 @@
"evaluation/langchain.js",
"evaluation/langchain.d.ts",
"evaluation/langchain.d.cts",
"evaluation/llm.cjs",
"evaluation/llm.js",
"evaluation/llm.d.ts",
"evaluation/llm.d.cts",
"schemas.cjs",
"schemas.js",
"schemas.d.ts",
@@ -113,7 +117,6 @@
"@babel/preset-env": "^7.22.4",
"@faker-js/faker": "^8.4.1",
"@jest/globals": "^29.5.0",
"@langchain/core": "^0.3.14",
"@langchain/langgraph": "^0.2.18",
"@langchain/openai": "^0.3.11",
"@opentelemetry/sdk-trace-base": "^1.26.0",
@@ -132,13 +135,14 @@
"eslint-plugin-no-instanceof": "^1.0.1",
"eslint-plugin-prettier": "^4.2.1",
"jest": "^29.5.0",
"langchain": "^0.3.3",
"openai": "^4.67.3",
"prettier": "^2.8.8",
"ts-jest": "^29.1.0",
"ts-node": "^10.9.1",
"typescript": "^5.4.5",
"zod": "^3.23.8"
"zod": "^3.23.8",
"@langchain/core": "^0.3.14",
Collaborator:
Good to keep these sorted and avoid touching unnecessary lines

"langchain": "^0.3.3"
},
"peerDependencies": {
"openai": "*"
@@ -209,6 +213,15 @@
"import": "./evaluation/langchain.js",
"require": "./evaluation/langchain.cjs"
},
"./evaluation/llm": {
"types": {
"import": "./evaluation/llm.d.ts",
"require": "./evaluation/llm.d.cts",
"default": "./evaluation/llm.d.ts"
},
"import": "./evaluation/llm.js",
"require": "./evaluation/llm.cjs"
},
"./schemas": {
"types": {
"import": "./schemas.d.ts",
1 change: 1 addition & 0 deletions js/scripts/create-entrypoints.js
@@ -12,6 +12,7 @@ const entrypoints = {
traceable: "traceable",
evaluation: "evaluation/index",
"evaluation/langchain": "evaluation/langchain",
"evaluation/llm": "evaluation/llm_evaluator",
Collaborator:
Naming should match filepath

So rename file below to evaluation/llm.ts

schemas: "schemas",
langchain: "langchain",
vercel: "vercel",
5 changes: 5 additions & 0 deletions js/src/evaluation/llm.ts
@@ -0,0 +1,5 @@
export {
CategoricalScoreConfig,
ContinuousScoreConfig,
LLMEvaluator,
} from "./llm_evaluator.js";
290 changes: 290 additions & 0 deletions js/src/evaluation/llm_evaluator.ts
@@ -0,0 +1,290 @@
// eslint-disable-next-line import/no-extraneous-dependencies
import { ChatPromptTemplate } from "@langchain/core/prompts";
import * as uuid from "uuid";
import {
EvaluationResult,
EvaluationResults,
RunEvaluator,
} from "./evaluator.js";
import type { Run, Example } from "../schemas.js";
// eslint-disable-next-line import/no-extraneous-dependencies
import { BaseLanguageModel } from "@langchain/core/language_models/base";

/**
* Configuration for categorical (enum-based) scoring in evaluations.
* Used to define discrete categories or labels for evaluation results.
*/
export class CategoricalScoreConfig {
Collaborator:
We should add good typedocs :)

Contributor Author:
tried my best, lmk what needs improving

/** Feedback key for the evaluator */
key: string;
/** Array of valid categorical choices/labels that can be assigned */
choices: string[];
/** Description of what this score measures or represents */
description: string;
/** Optional key for the LLM reasoning/explanation for the score */
reasoningKey?: string;
/** Optional description of score reasoning, provided to the LLM in the structured output */
reasoningDescription?: string;

/**
* Creates a new categorical score configuration
* @param params Configuration parameters
* @param params.key Feedback key for the evaluator
* @param params.choices Array of valid categorical options
* @param params.description Description of the scoring criteria
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
*/
constructor(params: {
key: string;
choices: string[];
description: string;
reasoningKey?: string;
reasoningDescription?: string;
}) {
this.key = params.key;
this.choices = params.choices;
this.description = params.description;
this.reasoningKey = params.reasoningKey;
this.reasoningDescription = params.reasoningDescription;
}
}
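
For illustration, a minimal usage sketch (the key, choices, and description here are hypothetical):

```ts
// A categorical judge that labels answers and asks the model to explain itself.
const helpfulness = new CategoricalScoreConfig({
  key: "helpfulness",
  choices: ["helpful", "unhelpful"],
  description: "Whether the answer addresses the user's question.",
  reasoningKey: "explanation",
});
```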

/**
* Configuration for continuous (numeric) scoring in evaluations.
* Used to define scores that fall within a numeric range.
*/
export class ContinuousScoreConfig {
/** Feedback key for the evaluator */
key: string;
/** Minimum allowed score value (defaults to 0) */
min: number;
/** Maximum allowed score value (defaults to 1) */
max: number;
/** Description of the scoring criteria */
description: string;
/** Optional key for the LLM reasoning/explanation for the score */
reasoningKey?: string;
/** Optional description of score reasoning, provided to the LLM in the structured output */
reasoningDescription?: string;

/**
* Creates a new continuous score configuration
* @param params Configuration parameters
* @param params.key Feedback key for the evaluator
* @param params.description Description of the scoring criteria
* @param params.min Optional minimum score value (defaults to 0)
* @param params.max Optional maximum score value (defaults to 1)
* @param params.reasoningKey Optional key for the LLM reasoning/explanation for the score
* @param params.reasoningDescription Optional description of score reasoning, provided to the LLM in the structured output
*/
constructor(params: {
key: string;
description: string;
min?: number;
max?: number;
reasoningKey?: string;
reasoningDescription?: string;
}) {
this.key = params.key;
this.min = params.min ?? 0;
this.max = params.max ?? 1;
this.description = params.description;
this.reasoningKey = params.reasoningKey;
this.reasoningDescription = params.reasoningDescription;
}
}
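
And the continuous counterpart (again with hypothetical values; min and max default to 0 and 1 when omitted):

```ts
// A numeric judge scoring factual accuracy on the default 0-1 scale.
const accuracy = new ContinuousScoreConfig({
  key: "accuracy",
  description: "How factually accurate the answer is.",
  reasoningKey: "explanation",
});
```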

type ScoreConfig = CategoricalScoreConfig | ContinuousScoreConfig;

function createScoreJsonSchema(scoreConfig: ScoreConfig): Record<string, any> {
const properties: Record<string, any> = {};

if (scoreConfig.reasoningKey) {
properties[scoreConfig.reasoningKey] = {
type: "string",
description:
scoreConfig.reasoningDescription ||
"First, think step by step to explain your score.",
};
}

if ("choices" in scoreConfig) {
properties.value = {
type: "string",
enum: scoreConfig.choices,
description: `The score for the evaluation, one of ${scoreConfig.choices.join(
", "
)}.`,
};
} else {
properties.score = {
type: "number",
minimum: scoreConfig.min,
maximum: scoreConfig.max,
description: `The score for the evaluation, between ${scoreConfig.min} and ${scoreConfig.max}, inclusive.`,
};
}

return {
title: scoreConfig.key,
description: scoreConfig.description,
type: "object",
properties,
required: scoreConfig.reasoningKey
? ["choices" in scoreConfig ? "value" : "score", scoreConfig.reasoningKey]
: ["choices" in scoreConfig ? "value" : "score"],
};
}
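
To make the output shape concrete, this is what createScoreJsonSchema produces for the categorical sketch above (traced from the code above, not from a separate spec):

```ts
const schema = {
  title: "helpfulness",
  description: "Whether the answer addresses the user's question.",
  type: "object",
  properties: {
    explanation: {
      type: "string",
      description: "First, think step by step to explain your score.",
    },
    value: {
      type: "string",
      enum: ["helpful", "unhelpful"],
      description: "The score for the evaluation, one of helpful, unhelpful.",
    },
  },
  required: ["value", "explanation"],
};
```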

interface LLMEvaluatorParams {
promptTemplate: string | [string, string][];
scoreConfig: ScoreConfig;
chatModel: BaseLanguageModel;
Collaborator:
Should be BaseChatModel
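
A sketch of that change (BaseChatModel is exported from @langchain/core/language_models/chat_models):

```ts
import { BaseChatModel } from "@langchain/core/language_models/chat_models";

interface LLMEvaluatorParams {
  promptTemplate: string | [string, string][];
  scoreConfig: ScoreConfig;
  chatModel: BaseChatModel; // narrower than BaseLanguageModel; guarantees chat semantics
  mapVariables?: (run: Run, example?: Example) => Record<string, any>;
}
```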

mapVariables?: (run: Run, example?: Example) => Record<string, any>;
}

export class LLMEvaluator implements RunEvaluator {
Collaborator:
No docstring?

prompt: any;
mapVariables?: (run: Run, example?: Example) => Record<string, any>;
Collaborator:
What are mapVariables?
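
Judging from prepareVariables below, mapVariables is an optional escape hatch that turns a run/example pair into the prompt's template variables when run.inputs or run.outputs has more than one key. A sketch with hypothetical key names:

```ts
// Only needed when runs carry multiple input/output keys.
const mapVariables = (run: Run, example?: Example) => ({
  input: run.inputs.question,          // hypothetical input key
  output: run.outputs?.answer,         // hypothetical output key
  expected: example?.outputs?.answer,  // hypothetical reference key
});
```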

scoreConfig: ScoreConfig;
scoreSchema: Record<string, any>;
runnable: any;

constructor() {}

static async create(params: LLMEvaluatorParams): Promise<LLMEvaluator> {
Collaborator:
You definitely shouldn't have two methods for this, ideally you just use the constructor
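
One way that could look (a sketch that keeps initialize's logic but runs it synchronously, since nothing in it actually awaits):

```ts
export class LLMEvaluator implements RunEvaluator {
  constructor(params: LLMEvaluatorParams) {
    this.scoreConfig = params.scoreConfig;
    this.mapVariables = params.mapVariables;
    this.scoreSchema = createScoreJsonSchema(params.scoreConfig);
    this.prompt =
      typeof params.promptTemplate === "string"
        ? ChatPromptTemplate.fromMessages([
            { role: "human", content: params.promptTemplate },
          ])
        : ChatPromptTemplate.fromMessages(params.promptTemplate);
    this.runnable = this.prompt.pipe(
      params.chatModel.withStructuredOutput(this.scoreSchema)
    );
  }
  // ...
}
```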

const evaluator = new LLMEvaluator();
await evaluator.initialize(
params.promptTemplate,
params.scoreConfig,
params.chatModel,
params.mapVariables
);
return evaluator;
}
private async initialize(
promptTemplate: string | [string, string][],
scoreConfig: ScoreConfig,
chatModel: BaseLanguageModel,
mapVariables?: (run: Run, example?: Example) => Record<string, any>
) {
try {
// Store the configuration
this.scoreConfig = scoreConfig;
Collaborator:
More conventional to make this a constructor instead of a factory method

this.mapVariables = mapVariables;

// Create the score schema
this.scoreSchema = createScoreJsonSchema(scoreConfig);

// Create the prompt template
if (typeof promptTemplate === "string") {
this.prompt = ChatPromptTemplate.fromMessages([
{ role: "human", content: promptTemplate },
]);
} else {
this.prompt = ChatPromptTemplate.fromMessages(promptTemplate);
}

const modelWithStructuredOutput = chatModel.withStructuredOutput
? chatModel.withStructuredOutput(this.scoreSchema)
: null;
if (!modelWithStructuredOutput) {
throw new Error("Passed chat model must support structured output");
}
this.runnable = this.prompt.pipe(modelWithStructuredOutput);
} catch (e: unknown) {
throw new Error(
`Failed to initialize LLMEvaluator: ${(e as Error).message}`
);
}
}

async evaluateRun(
run: Run,
example?: Example
): Promise<EvaluationResult | EvaluationResults> {
const runId = uuid.v4();
const variables = this.prepareVariables(run, example);
const output = await this.runnable.invoke(variables, { runId: runId });

return this.parseOutput(output, runId);
}

private prepareVariables(run: Run, example?: Example): Record<string, any> {
if (this.mapVariables) {
return this.mapVariables(run, example);
}

const variables: Record<string, any> = {};

// Input handling
if (Object.keys(run.inputs).length === 0) {
throw new Error(
"No input keys are present in run.inputs but the prompt requires 'input'."
);
}
if (Object.keys(run.inputs).length !== 1) {
throw new Error(
"Multiple input keys are present in run.inputs. Please provide a map_variables function."
);
}
variables.input = Object.values(run.inputs)[0];

// Output handling
if (!run.outputs || Object.keys(run.outputs).length === 0) {
throw new Error(
"No output keys are present in run.outputs but the prompt requires 'output'."
);
}
if (Object.keys(run.outputs).length !== 1) {
throw new Error(
"Multiple output keys are present in run.outputs. Please provide a map_variables function."
);
}
variables.output = Object.values(run.outputs)[0];

// Expected output handling
if (example?.outputs) {
if (Object.keys(example.outputs).length === 0) {
throw new Error(
"No output keys are present in example.outputs but the prompt requires 'expected'."
);
}
if (Object.keys(example.outputs).length !== 1) {
throw new Error(
"Multiple output keys are present in example.outputs. Please provide a map_variables function."
);
}
variables.expected = Object.values(example.outputs)[0];
}

return variables;
}

private parseOutput(
Collaborator:
Prefer protected instead of private to make subclassing possible

output: Record<string, any>,
runId: string
): EvaluationResult {
const explanation = this.scoreConfig.reasoningKey
? output[this.scoreConfig.reasoningKey]
: undefined;
if ("choices" in this.scoreConfig) {
const value = output.value;
return {
key: this.scoreConfig.key,
value,
comment: explanation,
sourceRunId: runId,
};
} else {
const score = output.score;
return {
key: this.scoreConfig.key,
score,
comment: explanation,
sourceRunId: runId,
};
}
}
}
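
For reviewers, an end-to-end sketch of how the evaluator might be wired up (the model and prompt are hypothetical; run and example would come from a LangSmith experiment):

```ts
import { ChatOpenAI } from "@langchain/openai";

const evaluator = await LLMEvaluator.create({
  promptTemplate:
    "Rate the answer.\nQuestion: {input}\nAnswer: {output}\nReference: {expected}",
  scoreConfig: new ContinuousScoreConfig({
    key: "accuracy",
    description: "How factually accurate the answer is compared to the reference.",
  }),
  chatModel: new ChatOpenAI({ model: "gpt-4o-mini" }),
});

// Returns { key, score, comment, sourceRunId } for continuous configs.
const result = await evaluator.evaluateRun(run, example);
```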