From 4ded31a83debe5b671d4a19c6e432667a4b328dc Mon Sep 17 00:00:00 2001
From: Pau Ramon Revilla
Date: Mon, 17 Jun 2024 12:14:07 +0200
Subject: [PATCH] Better normalization cache

The key seems to be too specific, especially in using the prop, which
makes it redundant to cache tokens that are found in different props.
The goal of the cache is to trade memory for time, but right now it
stores identical computations under different keys, which is
inefficient. The only thing the prop is needed for is
`stemmerSkipProperties`.
---
 packages/orama/src/components/tokenizer/index.ts | 12 +++++-------
 packages/orama/tests/search.test.ts              |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/packages/orama/src/components/tokenizer/index.ts b/packages/orama/src/components/tokenizer/index.ts
index 1e5b4c4a5..82dc94f3b 100644
--- a/packages/orama/src/components/tokenizer/index.ts
+++ b/packages/orama/src/components/tokenizer/index.ts
@@ -16,15 +16,13 @@ export interface DefaultTokenizer extends Tokenizer {
 }
 
 export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
-  const key = `${this.language}:${prop}:${token}`
-
-  if (this.normalizationCache.has(key)) {
-    return this.normalizationCache.get(key)!
+  if (this.normalizationCache.has(token)) {
+    return this.normalizationCache.get(token)!
   }
 
   // Remove stopwords if enabled
-  if (this.stopWords?.includes(token)) {
-    this.normalizationCache.set(key, '')
+  if (this.stopWords?.has(token)) {
+    this.normalizationCache.set(token, '')
     return ''
   }
 
@@ -34,7 +32,7 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
   }
 
   token = replaceDiacritics(token)
-  this.normalizationCache.set(key, token)
+  this.normalizationCache.set(token, token)
   return token
 }
 
diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts
index a4c905725..83008f7c8 100644
--- a/packages/orama/tests/search.test.ts
+++ b/packages/orama/tests/search.test.ts
@@ -725,7 +725,7 @@ t.test('search method', (t) => {
   t.test('with custom tokenizer', async (t) => {
     t.plan(4)
 
-    const normalizationCache = new Map([['english:foo:dogs', 'Dogs']])
+    const normalizationCache = new Map([['dogs', 'Dogs']])
 
     const db = await create({
       schema: {
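
For illustration only (not part of the patch): a minimal standalone sketch
of the keying change described above. `normalize`, `oldLookup`, and
`newLookup` are hypothetical stand-ins for the tokenizer internals; only
the cache-key strategy mirrors the diff.

    const cache = new Map<string, string>()

    // Placeholder for the real stemming + replaceDiacritics work.
    function normalize(token: string): string {
      return token.toLowerCase()
    }

    // Old scheme: one cache entry per (language, prop, token).
    function oldLookup(language: string, prop: string, token: string): string {
      const key = `${language}:${prop}:${token}`
      if (!cache.has(key)) cache.set(key, normalize(token))
      return cache.get(key)!
    }

    // New scheme: one cache entry per token, shared across props.
    function newLookup(token: string): string {
      if (!cache.has(token)) cache.set(token, normalize(token))
      return cache.get(token)!
    }

    // "dogs" appearing in two props is normalized and stored twice under
    // the old keys ("english:title:dogs", "english:description:dogs"),
    // but only once under the new token-only key.
    oldLookup('english', 'title', 'dogs')
    oldLookup('english', 'description', 'dogs')
    newLookup('dogs')
    newLookup('dogs')

As the commit message notes, the prop still matters for
`stemmerSkipProperties`, so the token-only key assumes normalization is
otherwise prop-independent.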