From d275c9d84cbbadcb09c1bfc60af5d100a532af70 Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Wed, 17 Jul 2024 08:10:30 +0800 Subject: [PATCH 1/2] feat: add GGMLFileQuantizationType and apply to test - close #794 --- packages/gguf/src/gguf.spec.ts | 18 ++++++++------- packages/gguf/src/gguf.ts | 2 +- packages/gguf/src/types.ts | 41 +++++++++++++++++++++++++++++++++- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/packages/gguf/src/gguf.spec.ts b/packages/gguf/src/gguf.spec.ts index ca0eb602e..3fdc1636d 100644 --- a/packages/gguf/src/gguf.spec.ts +++ b/packages/gguf/src/gguf.spec.ts @@ -1,6 +1,6 @@ import { beforeAll, describe, expect, it } from "vitest"; import type { GGUFParseOutput } from "./gguf"; -import { GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf"; +import { GGMLFileQuantizationType, GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf"; import fs from "node:fs"; const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf"; @@ -21,9 +21,11 @@ describe("gguf", () => { if (!fs.existsSync(".cache")) { fs.mkdirSync(".cache"); } - const res = await fetch(URL_BIG_METADATA); - const arrayBuf = await res.arrayBuffer(); - fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf)); + if (!fs.existsSync(".cache/model.gguf")) { + const res = await fetch(URL_BIG_METADATA); + const arrayBuf = await res.arrayBuffer(); + fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf)); + } }); it("should parse a llama2 7b", async () => { @@ -37,7 +39,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 19n, "general.architecture": "llama", - "general.file_type": 10, + "general.file_type": GGMLFileQuantizationType.Q2_K, "general.name": "LLaMA v2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -96,7 +98,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 24n, "general.architecture": "llama", - "general.file_type": 17, + "general.file_type": GGMLFileQuantizationType.Q5_K_M, "general.name": "mistralai_mistral-7b-instruct-v0.2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -134,7 +136,7 @@ describe("gguf", () => { tensor_count: 164n, kv_count: 21n, "general.architecture": "gemma", - "general.file_type": GGMLQuantizationType.Q8_K, // 15 + "general.file_type": GGMLFileQuantizationType.Q4_K_M, "general.name": "gemma-2b-it", "general.quantization_version": 2, "gemma.attention.head_count": 8, @@ -171,7 +173,7 @@ describe("gguf", () => { tensor_count: 197n, kv_count: 23n, "general.architecture": "bert", - "general.file_type": GGMLQuantizationType.F16, + "general.file_type": GGMLFileQuantizationType.F16, "general.name": "bge-small-en-v1.5", "bert.attention.causal": false, "bert.attention.head_count": 12, diff --git a/packages/gguf/src/gguf.ts b/packages/gguf/src/gguf.ts index 4d32567bf..945d5a494 100644 --- a/packages/gguf/src/gguf.ts +++ b/packages/gguf/src/gguf.ts @@ -4,7 +4,7 @@ import { isBackend } from "./utils/isBackend"; import { promisesQueue } from "./utils/promisesQueue"; export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types"; -export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types"; +export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types"; export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions"; export const RE_GGUF_FILE = /\.gguf$/; diff --git a/packages/gguf/src/types.ts b/packages/gguf/src/types.ts index f2fbbcdfb..0a6069302 100644 --- a/packages/gguf/src/types.ts +++ b/packages/gguf/src/types.ts @@ -6,6 +6,45 @@ export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataVa export type Version = 1 | 2 | 3; +export enum GGMLFileQuantizationType { + F32 = 0, + F16 = 1, + Q4_0 = 2, + Q4_1 = 3, + Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // Q4_2 = 5, // support has been removed + // Q4_3 = 6, // support has been removed + Q8_0 = 7, + Q5_0 = 8, + Q5_1 = 9, + Q2_K = 10, + Q3_K_S = 11, + Q3_K_M = 12, + Q3_K_L = 13, + Q4_K_S = 14, + Q4_K_M = 15, + Q5_K_S = 16, + Q5_K_M = 17, + Q6_K = 18, + IQ2_XXS = 19, + IQ2_XS = 20, + Q2_K_S = 21, + IQ3_XS = 22, + IQ3_XXS = 23, + IQ1_S = 24, + IQ4_NL = 25, + IQ3_S = 26, + IQ3_M = 27, + IQ2_S = 28, + IQ2_M = 29, + IQ4_XS = 30, + IQ1_M = 31, + BF16 = 32, + Q4_0_4_4 = 33, + Q4_0_4_8 = 34, + Q4_0_8_8 = 35, +} + export enum GGMLQuantizationType { F32 = 0, F16 = 1, @@ -60,7 +99,7 @@ export type Architecture = (typeof ARCHITECTURES)[number]; export interface GGUFGeneralInfo { "general.architecture": TArchitecture; "general.name"?: string; - "general.file_type"?: number; + "general.file_type"?: GGMLFileQuantizationType; "general.quantization_version"?: number; } From d300964729f46812c490bab165db2ca827e21a9e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 16 Aug 2024 12:30:17 +0200 Subject: [PATCH 2/2] add MOSTLY_ prefix --- packages/gguf/src/gguf.spec.ts | 8 ++-- packages/gguf/src/types.ts | 72 +++++++++++++++++----------------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/packages/gguf/src/gguf.spec.ts b/packages/gguf/src/gguf.spec.ts index 3fdc1636d..eb74fc5d6 100644 --- a/packages/gguf/src/gguf.spec.ts +++ b/packages/gguf/src/gguf.spec.ts @@ -39,7 +39,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 19n, "general.architecture": "llama", - "general.file_type": GGMLFileQuantizationType.Q2_K, + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K, "general.name": "LLaMA v2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -98,7 +98,7 @@ describe("gguf", () => { tensor_count: 291n, kv_count: 24n, "general.architecture": "llama", - "general.file_type": GGMLFileQuantizationType.Q5_K_M, + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M, "general.name": "mistralai_mistral-7b-instruct-v0.2", "general.quantization_version": 2, "llama.attention.head_count": 32, @@ -136,7 +136,7 @@ describe("gguf", () => { tensor_count: 164n, kv_count: 21n, "general.architecture": "gemma", - "general.file_type": GGMLFileQuantizationType.Q4_K_M, + "general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M, "general.name": "gemma-2b-it", "general.quantization_version": 2, "gemma.attention.head_count": 8, @@ -173,7 +173,7 @@ describe("gguf", () => { tensor_count: 197n, kv_count: 23n, "general.architecture": "bert", - "general.file_type": GGMLFileQuantizationType.F16, + "general.file_type": GGMLFileQuantizationType.MOSTLY_F16, "general.name": "bge-small-en-v1.5", "bert.attention.causal": false, "bert.attention.head_count": 12, diff --git a/packages/gguf/src/types.ts b/packages/gguf/src/types.ts index 0a6069302..02872b95c 100644 --- a/packages/gguf/src/types.ts +++ b/packages/gguf/src/types.ts @@ -7,42 +7,42 @@ export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataVa export type Version = 1 | 2 | 3; export enum GGMLFileQuantizationType { - F32 = 0, - F16 = 1, - Q4_0 = 2, - Q4_1 = 3, - Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // Q4_2 = 5, // support has been removed - // Q4_3 = 6, // support has been removed - Q8_0 = 7, - Q5_0 = 8, - Q5_1 = 9, - Q2_K = 10, - Q3_K_S = 11, - Q3_K_M = 12, - Q3_K_L = 13, - Q4_K_S = 14, - Q4_K_M = 15, - Q5_K_S = 16, - Q5_K_M = 17, - Q6_K = 18, - IQ2_XXS = 19, - IQ2_XS = 20, - Q2_K_S = 21, - IQ3_XS = 22, - IQ3_XXS = 23, - IQ1_S = 24, - IQ4_NL = 25, - IQ3_S = 26, - IQ3_M = 27, - IQ2_S = 28, - IQ2_M = 29, - IQ4_XS = 30, - IQ1_M = 31, - BF16 = 32, - Q4_0_4_4 = 33, - Q4_0_4_8 = 34, - Q4_0_8_8 = 35, + MOSTLY_F32 = 0, + MOSTLY_F16 = 1, + MOSTLY_Q4_0 = 2, + MOSTLY_Q4_1 = 3, + MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // MOSTLY_Q4_2 = 5, // support has been removed + // MOSTLY_Q4_3 = 6, // support has been removed + MOSTLY_Q8_0 = 7, + MOSTLY_Q5_0 = 8, + MOSTLY_Q5_1 = 9, + MOSTLY_Q2_K = 10, + MOSTLY_Q3_K_S = 11, + MOSTLY_Q3_K_M = 12, + MOSTLY_Q3_K_L = 13, + MOSTLY_Q4_K_S = 14, + MOSTLY_Q4_K_M = 15, + MOSTLY_Q5_K_S = 16, + MOSTLY_Q5_K_M = 17, + MOSTLY_Q6_K = 18, + MOSTLY_IQ2_XXS = 19, + MOSTLY_IQ2_XS = 20, + MOSTLY_Q2_K_S = 21, + MOSTLY_IQ3_XS = 22, + MOSTLY_IQ3_XXS = 23, + MOSTLY_IQ1_S = 24, + MOSTLY_IQ4_NL = 25, + MOSTLY_IQ3_S = 26, + MOSTLY_IQ3_M = 27, + MOSTLY_IQ2_S = 28, + MOSTLY_IQ2_M = 29, + MOSTLY_IQ4_XS = 30, + MOSTLY_IQ1_M = 31, + MOSTLY_BF16 = 32, + MOSTLY_Q4_0_4_4 = 33, + MOSTLY_Q4_0_4_8 = 34, + MOSTLY_Q4_0_8_8 = 35, } export enum GGMLQuantizationType {