Skip to content

Commit

Permalink
feat: add GGMLFileQuantizationType and apply to test (#806)
Browse files Browse the repository at this point in the history
@mishig25 that's it for #794

---------

Co-authored-by: Xuan Son Nguyen <[email protected]>
  • Loading branch information
snowyu and ngxson authored Aug 16, 2024
1 parent 6cd5358 commit 1140e0c
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 10 deletions.
18 changes: 10 additions & 8 deletions packages/gguf/src/gguf.spec.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { beforeAll, describe, expect, it } from "vitest";
import type { GGUFParseOutput } from "./gguf";
import { GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf";
import { GGMLFileQuantizationType, GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf";
import fs from "node:fs";

const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf";
Expand All @@ -21,9 +21,11 @@ describe("gguf", () => {
if (!fs.existsSync(".cache")) {
fs.mkdirSync(".cache");
}
const res = await fetch(URL_BIG_METADATA);
const arrayBuf = await res.arrayBuffer();
fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf));
if (!fs.existsSync(".cache/model.gguf")) {
const res = await fetch(URL_BIG_METADATA);
const arrayBuf = await res.arrayBuffer();
fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf));
}
});

it("should parse a llama2 7b", async () => {
Expand All @@ -37,7 +39,7 @@ describe("gguf", () => {
tensor_count: 291n,
kv_count: 19n,
"general.architecture": "llama",
"general.file_type": 10,
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K,
"general.name": "LLaMA v2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
Expand Down Expand Up @@ -96,7 +98,7 @@ describe("gguf", () => {
tensor_count: 291n,
kv_count: 24n,
"general.architecture": "llama",
"general.file_type": 17,
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M,
"general.name": "mistralai_mistral-7b-instruct-v0.2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
Expand Down Expand Up @@ -134,7 +136,7 @@ describe("gguf", () => {
tensor_count: 164n,
kv_count: 21n,
"general.architecture": "gemma",
"general.file_type": GGMLQuantizationType.Q8_K, // 15
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M,
"general.name": "gemma-2b-it",
"general.quantization_version": 2,
"gemma.attention.head_count": 8,
Expand Down Expand Up @@ -171,7 +173,7 @@ describe("gguf", () => {
tensor_count: 197n,
kv_count: 23n,
"general.architecture": "bert",
"general.file_type": GGMLQuantizationType.F16,
"general.file_type": GGMLFileQuantizationType.MOSTLY_F16,
"general.name": "bge-small-en-v1.5",
"bert.attention.causal": false,
"bert.attention.head_count": 12,
Expand Down
2 changes: 1 addition & 1 deletion packages/gguf/src/gguf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { isBackend } from "./utils/isBackend";
import { promisesQueue } from "./utils/promisesQueue";

export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types";
export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";

export const RE_GGUF_FILE = /\.gguf$/;
Expand Down
41 changes: 40 additions & 1 deletion packages/gguf/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,45 @@ export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataVa

export type Version = 1 | 2 | 3;

/**
 * Overall quantization scheme of a GGUF file, as stored in the
 * `general.file_type` metadata key.
 *
 * "MOSTLY_" reflects that a file tagged with one of these values may still
 * contain a few tensors in other formats (see the Q4_1_SOME_F16 note below).
 *
 * NOTE(review): the numeric values appear to mirror llama.cpp's `llama_ftype`
 * enum — confirm against upstream before adding or renumbering entries; the
 * gaps at 5 and 6 are intentionally preserved (see comments below).
 */
export enum GGMLFileQuantizationType {
MOSTLY_F32 = 0,
MOSTLY_F16 = 1,
MOSTLY_Q4_0 = 2,
MOSTLY_Q4_1 = 3,
MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// MOSTLY_Q4_2 = 5, // support has been removed
// MOSTLY_Q4_3 = 6, // support has been removed
MOSTLY_Q8_0 = 7,
MOSTLY_Q5_0 = 8,
MOSTLY_Q5_1 = 9,
// K-quants (block-wise "K" quantization; S/M/L = small/medium/large variants)
MOSTLY_Q2_K = 10,
MOSTLY_Q3_K_S = 11,
MOSTLY_Q3_K_M = 12,
MOSTLY_Q3_K_L = 13,
MOSTLY_Q4_K_S = 14,
MOSTLY_Q4_K_M = 15,
MOSTLY_Q5_K_S = 16,
MOSTLY_Q5_K_M = 17,
MOSTLY_Q6_K = 18,
// I-quants (importance-matrix-based quantization)
MOSTLY_IQ2_XXS = 19,
MOSTLY_IQ2_XS = 20,
MOSTLY_Q2_K_S = 21,
MOSTLY_IQ3_XS = 22,
MOSTLY_IQ3_XXS = 23,
MOSTLY_IQ1_S = 24,
MOSTLY_IQ4_NL = 25,
MOSTLY_IQ3_S = 26,
MOSTLY_IQ3_M = 27,
MOSTLY_IQ2_S = 28,
MOSTLY_IQ2_M = 29,
MOSTLY_IQ4_XS = 30,
MOSTLY_IQ1_M = 31,
MOSTLY_BF16 = 32,
// Q4_0 variants repacked for specific SIMD widths (4x4 / 4x8 / 8x8)
MOSTLY_Q4_0_4_4 = 33,
MOSTLY_Q4_0_4_8 = 34,
MOSTLY_Q4_0_8_8 = 35,
}

export enum GGMLQuantizationType {
F32 = 0,
F16 = 1,
Expand Down Expand Up @@ -60,7 +99,7 @@ export type Architecture = (typeof ARCHITECTURES)[number];
export interface GGUFGeneralInfo<TArchitecture extends Architecture> {
"general.architecture": TArchitecture;
"general.name"?: string;
"general.file_type"?: number;
"general.file_type"?: GGMLFileQuantizationType;
"general.quantization_version"?: number;
}

Expand Down

0 comments on commit 1140e0c

Please sign in to comment.