From 4ded31a83debe5b671d4a19c6e432667a4b328dc Mon Sep 17 00:00:00 2001
From: Pau Ramon Revilla
Date: Mon, 17 Jun 2024 12:14:07 +0200
Subject: [PATCH] Better normalization cache

The key seems to be too specific, especially in using the prop, which
makes it redundant to cache tokens that are found in different props.
The goal of the cache is to trade memory for time, but right now it
stores identical computations under different keys, which is
inefficient. The only thing the prop is needed for is
`stemmerSkipProperties`.
---
 packages/orama/src/components/tokenizer/index.ts | 12 +++++-------
 packages/orama/tests/search.test.ts              |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/packages/orama/src/components/tokenizer/index.ts b/packages/orama/src/components/tokenizer/index.ts
index 1e5b4c4a5..82dc94f3b 100644
--- a/packages/orama/src/components/tokenizer/index.ts
+++ b/packages/orama/src/components/tokenizer/index.ts
@@ -16,15 +16,13 @@ export interface DefaultTokenizer extends Tokenizer {
 }
 
 export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
-  const key = `${this.language}:${prop}:${token}`
-
-  if (this.normalizationCache.has(key)) {
-    return this.normalizationCache.get(key)!
+  if (this.normalizationCache.has(token)) {
+    return this.normalizationCache.get(token)!
   }
 
   // Remove stopwords if enabled
-  if (this.stopWords?.includes(token)) {
-    this.normalizationCache.set(key, '')
+  if (this.stopWords?.has(token)) {
+    this.normalizationCache.set(token, '')
     return ''
   }
 
@@ -34,7 +32,7 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
   }
 
   token = replaceDiacritics(token)
-  this.normalizationCache.set(key, token)
+  this.normalizationCache.set(token, token)
   return token
 }
 
diff --git a/packages/orama/tests/search.test.ts b/packages/orama/tests/search.test.ts
index a4c905725..83008f7c8 100644
--- a/packages/orama/tests/search.test.ts
+++ b/packages/orama/tests/search.test.ts
@@ -725,7 +725,7 @@ t.test('search method', (t) => {
   t.test('with custom tokenizer', async (t) => {
     t.plan(4)
 
-    const normalizationCache = new Map([['english:foo:dogs', 'Dogs']])
+    const normalizationCache = new Map([['dogs', 'Dogs']])
 
     const db = await create({
       schema: {
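
For illustration only (not part of the patch): a minimal standalone sketch
of the keying change described above. `normalize`, `oldLookup`, and
`newLookup` are hypothetical stand-ins for the tokenizer internals; only
the cache-key strategy mirrors the diff.

    const cache = new Map<string, string>()

    // Placeholder for the real stemming + replaceDiacritics work.
    function normalize(token: string): string {
      return token.toLowerCase()
    }

    // Old scheme: one cache entry per (language, prop, token).
    function oldLookup(language: string, prop: string, token: string): string {
      const key = `${language}:${prop}:${token}`
      if (!cache.has(key)) cache.set(key, normalize(token))
      return cache.get(key)!
    }

    // New scheme: one cache entry per token, shared across props.
    function newLookup(token: string): string {
      if (!cache.has(token)) cache.set(token, normalize(token))
      return cache.get(token)!
    }

    // "dogs" appearing in two props is normalized and stored twice under
    // the old keys ("english:title:dogs", "english:description:dogs"),
    // but only once under the new token-only key.
    oldLookup('english', 'title', 'dogs')
    oldLookup('english', 'description', 'dogs')
    newLookup('dogs')
    newLookup('dogs')

As the commit message notes, the prop still matters for
`stemmerSkipProperties`, so the token-only key assumes normalization is
otherwise prop-independent.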