Enable messages api #581

Closed
wants to merge 19 commits
Changes from 12 commits
5 changes: 3 additions & 2 deletions packages/inference/README.md
@@ -53,9 +53,10 @@ import { textGeneration } from "@huggingface/inference";

await textGeneration({
accessToken: "hf_...",
model: "model_or_endpoint",
model: "model",
inputs: ...,
parameters: ...
parameters: ...,
endpointUrl: "custom endpoint url",
})
```

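For reference, a minimal sketch of the new parameter in use (the endpoint URL and model values below are placeholders, not part of this PR):

```ts
import { textGeneration } from "@huggingface/inference";

// Hypothetical dedicated endpoint; when endpointUrl is set it takes precedence over the model id.
const output = await textGeneration({
  accessToken: "hf_...",
  model: "gpt2",
  inputs: "The answer to the universe is",
  endpointUrl: "https://my-endpoint.example.com",
});
console.log(output.generated_text);
```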
8 changes: 4 additions & 4 deletions packages/inference/src/HfInference.ts
@@ -14,9 +14,9 @@ type TaskWithNoAccessToken = {
) => ReturnType<Task[key]>;
};

type TaskWithNoAccessTokenNoModel = {
type TaskWithNoAccessTokenNoEndpointUrl = {
[key in keyof Task]: (
args: DistributiveOmit<Parameters<Task[key]>[0], "accessToken" | "model">,
args: DistributiveOmit<Parameters<Task[key]>[0], "accessToken" | "endpointUrl">,
options?: Parameters<Task[key]>[1]
) => ReturnType<Task[key]>;
};
@@ -57,12 +57,12 @@ export class HfInferenceEndpoint {
enumerable: false,
value: (params: RequestArgs, options: Options) =>
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fn({ ...params, accessToken, model: endpointUrl } as any, { ...defaultOptions, ...options }),
fn({ ...params, accessToken, endpointUrl } as any, { ...defaultOptions, ...options }),
});
}
}
}

export interface HfInference extends TaskWithNoAccessToken {}

export interface HfInferenceEndpoint extends TaskWithNoAccessTokenNoModel {}
export interface HfInferenceEndpoint extends TaskWithNoAccessTokenNoEndpointUrl {}
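Put differently, an `HfInferenceEndpoint` now binds `accessToken` and `endpointUrl` at construction, so the per-task methods accept neither; a small sketch (the endpoint URL is a placeholder):

```ts
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_...");
// endpoint() captures the token and the endpoint URL for every task method.
const ep = hf.endpoint("https://my-endpoint.example.com");
const { generated_text } = await ep.textGeneration({ inputs: "Hello" });
```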
19 changes: 12 additions & 7 deletions packages/inference/src/lib/makeRequestOptions.ts
@@ -1,4 +1,6 @@
import type { InferenceTask, Options, RequestArgs } from "../types";
import { isObjectEmpty } from "../utils/isEmpty";
import { omit } from "../utils/omit";
import { HF_HUB_URL } from "./getDefaultTask";
import { isUrl } from "./isUrl";

@@ -24,8 +26,7 @@ export async function makeRequestOptions(
taskHint?: InferenceTask;
}
): Promise<{ url: string; info: RequestInit }> {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const { accessToken, model: _model, ...otherArgs } = args;
const { accessToken, endpointUrl, ...otherArgs } = args;
let { model } = args;
const {
forceTask: task,
@@ -78,10 +79,16 @@ }
}

const url = (() => {
if (endpointUrl && isUrl(model)) {
throw new TypeError("Both model and endpointUrl cannot be URLs");
}
if (isUrl(model)) {
console.warn("Using a model URL is deprecated, please use the `endpointUrl` parameter instead");
return model;
}

if (endpointUrl) {
return endpointUrl;
}
if (task) {
return `${HF_INFERENCE_API_BASE_URL}/pipeline/${task}/${model}`;
}
@@ -103,19 +110,17 @@
} else if (includeCredentials === undefined) {
credentials = "same-origin";
}

const info: RequestInit = {
headers,
method: "POST",
body: binary
? args.data
: JSON.stringify({
...otherArgs,
options: options && otherOptions,
...(otherArgs.model && isUrl(otherArgs.model) ? omit(otherArgs, "model") : otherArgs),
...(otherOptions && !isObjectEmpty(otherOptions) && { options: otherOptions }),
}),
credentials,
signal: options?.signal,
};

return { url, info };
}
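To summarize the precedence implemented above: an explicit `endpointUrl` wins over a plain model id, a URL passed as `model` still works but warns, and otherwise the public Inference API URL is built from the task/model. A rough sketch of the same order (the base-URL constant, the `isUrl` check, and the `/models/...` fallback are assumptions based on the surrounding code, not verbatim from it):

```ts
// Simplified mirror of the URL resolution in makeRequestOptions (sketch only).
function resolveRequestUrl(model: string, endpointUrl?: string, task?: string): string {
  const isUrl = (value: string) => /^https?:\/\//.test(value); // assumed check
  if (endpointUrl && isUrl(model)) {
    throw new TypeError("Both model and endpointUrl cannot be URLs");
  }
  if (isUrl(model)) {
    console.warn("Using a model URL is deprecated, please use the `endpointUrl` parameter instead");
    return model;
  }
  if (endpointUrl) {
    return endpointUrl;
  }
  const base = "https://api-inference.huggingface.co"; // assumed HF_INFERENCE_API_BASE_URL
  return task ? `${base}/pipeline/${task}/${model}` : `${base}/models/${model}`; // assumed fallback
}
```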
3 changes: 3 additions & 0 deletions packages/inference/src/tasks/custom/streamingRequest.ts
@@ -67,6 +67,9 @@ export async function* streamingRequest<T>(
onChunk(value);
for (const event of events) {
if (event.data.length > 0) {
if (event.data === "[DONE]") {
return;
}
const data = JSON.parse(event.data);
if (typeof data === "object" && data !== null && "error" in data) {
throw new Error(data.error);
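For context: OpenAI-compatible servers terminate a server-sent-events stream with a literal `data: [DONE]` line, which is not JSON, hence the early return above. A minimal sketch of handling one parsed event (the event shape here is assumed):

```ts
interface SSEEvent {
  data: string; // assumed minimal shape of a parsed server-sent event
}

// Returns the parsed payload, or undefined when the stream should stop.
function handleEvent(event: SSEEvent): unknown {
  if (event.data.length === 0) return undefined;
  if (event.data === "[DONE]") return undefined; // end-of-stream sentinel, not valid JSON
  const data = JSON.parse(event.data);
  if (typeof data === "object" && data !== null && "error" in data) {
    throw new Error(String((data as { error: unknown }).error));
  }
  return data;
}
```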
3 changes: 3 additions & 0 deletions packages/inference/src/tasks/nlp/textGeneration.ts
@@ -14,6 +14,9 @@ export async function textGeneration(
...options,
taskHint: "text-generation",
});
if (typeof res === "object" && res.hasOwnProperty("choices")) {
return res;
}
const isValidOutput = Array.isArray(res) && res.every((x) => typeof x?.generated_text === "string");
if (!isValidOutput) {
throw new InferenceOutputError("Expected Array<{generated_text: string}>");
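Callers can then branch on the returned shape; a rough consumption sketch, reusing the chat-completions endpoint URL from the tests below (the exact output typing depends on the accompanying type changes in this PR):

```ts
import { HfInference } from "@huggingface/inference";

const ep = new HfInference("hf_...").endpoint(
  "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2/v1/chat/completions"
);
const res = await ep.textGeneration({
  model: "tgi",
  messages: [{ role: "user", content: "One plus one equals" }],
});

if (res.choices && res.choices.length > 0) {
  console.log(res.choices[0].message.content); // Messages API (OpenAI-style) response
} else {
  console.log(res.generated_text); // classic text-generation response
}
```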
23 changes: 23 additions & 0 deletions packages/inference/src/tasks/nlp/textGenerationStream.ts
@@ -80,6 +80,29 @@ export interface TextGenerationStreamOutput {
* Only available when the generation is finished
*/
details: TextGenerationStreamDetails | null;
/**
* Present when the response follows the Messages API (OpenAI chat completions) format
*/
choices?: Choice[];
}

export interface Choice {
index: number;
delta: {
role: string;
content?: string;
tool_calls?: {
index: number;
id: string;
type: string;
function: {
name?: string;
arguments: string;
};
};
};
logprobs?: Record<string, unknown>;
finish_reason?: string;
}

/**
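On the streaming side, the new `choices` deltas can be accumulated alongside the classic token stream; a short sketch reusing the `ep` handle from the previous sketch:

```ts
let out = "";
for await (const chunk of ep.textGenerationStream({
  model: "tgi",
  messages: [{ role: "user", content: "1 + 1 =" }],
})) {
  if (chunk.choices && chunk.choices.length > 0) {
    out += chunk.choices[0].delta.content ?? ""; // Messages API delta
  } else {
    out += chunk.token.text; // classic TextGenerationStreamOutput token
  }
}
```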
18 changes: 16 additions & 2 deletions packages/inference/src/types.ts
@@ -47,15 +47,29 @@ export interface BaseArgs {
*/
accessToken?: string;
/**
* The model to use. Can be a full URL for a dedicated inference endpoint.
* The model to use.
*
* If not specified, will call huggingface.co/api/tasks to get the default model for the task.
*
* /!\ Legacy behavior allows this to be a URL, but this is deprecated and will be removed in the future.
* Use the `endpointUrl` parameter instead.
*/
model?: string;

/**
* The URL of the endpoint to use.
*
* If specified, requests are sent to this URL instead of the default Inference API URL for the model.
*/
endpointUrl?: string;
}

export type RequestArgs = BaseArgs &
({ data: Blob | ArrayBuffer } | { inputs: unknown }) & {
(
| { data: Blob | ArrayBuffer }
| { inputs: unknown }
| { messages?: Array<{ role: "user" | "assistant"; content: string }> }
) & {
parameters?: Record<string, unknown>;
accessToken?: string;
};
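For illustration, the widened union now accepts a chat-style payload such as the following (values are examples; `RequestArgs` itself is internal to the package):

```ts
// Example of args that now satisfy RequestArgs via the `messages` branch.
const args = {
  accessToken: "hf_...",
  endpointUrl: "https://my-endpoint.example.com/v1/chat/completions",
  messages: [{ role: "user" as const, content: "Hello!" }],
  parameters: { max_tokens: 100 },
};
```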
8 changes: 8 additions & 0 deletions packages/inference/src/utils/isEmpty.ts
@@ -0,0 +1,8 @@
export function isObjectEmpty(object: object): boolean {
for (const prop in object) {
if (Object.prototype.hasOwnProperty.call(object, prop)) {
return false;
}
}
return true;
}
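A quick usage note (assuming `isObjectEmpty` is in scope): the helper reports whether an object has any own enumerable properties, which is how makeRequestOptions decides to drop an empty `options` object from the request body.

```ts
isObjectEmpty({});                        // true
isObjectEmpty({ wait_for_model: true });  // false

// Pattern used in makeRequestOptions: only attach `options` when it is non-empty.
const otherOptions = { wait_for_model: true };
const body = JSON.stringify({
  inputs: "Hello",
  ...(otherOptions && !isObjectEmpty(otherOptions) && { options: otherOptions }),
});
```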
101 changes: 90 additions & 11 deletions packages/inference/test/HfInference.spec.ts
@@ -203,34 +203,31 @@ describe.concurrent(
});

it("textGenerationStream - google/flan-t5-xxl", async () => {
const phrase = "one two three four";
const response = hf.textGenerationStream({
model: "google/flan-t5-xxl",
inputs: `repeat "${phrase}"`,
inputs: "Please answer the following question: complete one two and ____.",
});

const makeExpectedReturn = (tokenText: string, fullPhrase: string): TextGenerationStreamOutput => {
const eot = tokenText === "</s>";
const eot = tokenText === "</s>" || tokenText === null;
return {
details: null,
token: {
id: expect.any(Number),
logprob: expect.any(Number),
text: expect.stringContaining(tokenText),
special: eot,
text: expect.any(String) || null,
special: expect.any(Boolean),
},
generated_text: eot ? fullPhrase : null,
};
};

const expectedTokens = phrase.split(" ");
// eot token
expectedTokens.push("</s>");
const word = "three";
const expectedTokens = [word, "</s>"];

for await (const ret of response) {
const expectedToken = expectedTokens.shift();
assert(expectedToken);
expect(ret).toMatchObject(makeExpectedReturn(expectedToken, phrase));
expect(ret).toMatchObject(makeExpectedReturn(expectedToken, word));
}
});

@@ -244,7 +241,7 @@
});

await expect(response.next()).rejects.toThrow(
"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 4096. Given: 17 `inputs` tokens and 10000 `max_new_tokens`"
"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 17 `inputs` tokens and 10000 `max_new_tokens`"
);
});

@@ -651,6 +648,88 @@
});
expect(generated_text).toEqual("three");
});

it("textGeneration - OpenAI Specs", async () => {
const ep = hf.endpoint(
"https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2/v1/chat/completions"
);
const res = await ep.textGeneration({
model: "tgi",
messages: [{ role: "user", content: "Complete the this sentence with words one plus one is equal " }],
parameters: {
max_tokens: 500,
return_full_text: false,
temperature: 0.0,
seed: 0,
},
});
if (res.choices && res.choices.length > 0) {
const completion = res.choices[0].message.content;
expect(completion).toContain(" One plus one is equal to two.");
}
});
it("textGenerationStream - OpenAI Specs", async () => {
const ep = hf.endpoint(
"https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2/v1/chat/completions"
);
const stream = ep.textGenerationStream({
model: "tgi",
messages: [{ role: "user", content: "Complete the equation 1+1= ,just the answer" }],
parameters: {
max_tokens: 500,
return_full_text: false,
temperature: 0.0,
seed: 0,
},
});
let out = "";
for await (const chunk of stream) {
if (chunk.choices && chunk.choices.length > 0) {
out += chunk.choices[0].delta.content;
}
}
expect(out).toContain("The answer to the equation 1 + 1 is 2.</s>");
});
it("mistral - OpenAI Specs", async () => {
const MISTRAL_KEY = env.MISTRAL_KEY;
if (!MISTRAL_KEY) {
console.warn("Skipping test because MISTRAL_KEY is not set");
return;
}
const hf = new HfInference(MISTRAL_KEY);
const ep = hf.endpoint("https://api.mistral.ai/v1/chat/completions");
const stream = ep.streamingRequest({
model: "mistral-tiny",
messages: [{ role: "user", content: "Complete the equation one + one = , just the answer" }],
}) as AsyncGenerator<TextGenerationStreamOutput>;
let out = "";
for await (const chunk of stream) {
if (chunk.choices && chunk.choices.length > 0) {
out += chunk.choices[0].delta.content;
}
}
expect(out).toContain("The answer to the equation one + one is two.");
});
it("openai - OpenAI Specs", async () => {
const OPENAI_KEY = env.OPENAI_KEY;
if (!OPENAI_KEY) {
console.warn("Skipping test because OPENAI_KEY is not set");
return;
}
const hf = new HfInference(OPENAI_KEY);
const ep = hf.endpoint("https://api.openai.com/v1/chat/completions");
const stream = ep.streamingRequest({
model: "gpt-3.5-turbo",
messages: [{ role: "user", content: "Complete the equation one + one =" }],
}) as AsyncGenerator<TextGenerationStreamOutput>;
let out = "";
for await (const chunk of stream) {
if (chunk.choices && chunk.choices.length > 0) {
out += chunk.choices[0].delta.content;
}
}
expect(out).toContain("two");
});
},
TIMEOUT
);