Use a simplified token counting method for big files (#6014)

## Changes

Tokenisation of huge files using tiktoken is very costly, and we can save a lot of CPU by simplifying it. The number of tokens will always be greater than the number of words, so if the number of words exceeds `EXTENDED_USER_CONTEXT_TOKEN_BUDGET` we can just return the word count instead. For determining whether a file can be used as user context, the result remains equally correct. For other purposes (if any), accuracy may suffer.

## Test plan

1. Build with the JetBrains plugin.
2. Open a big file (a few MB of text).
3. Select a text fragment and start moving the mouse cursor while holding the LMB pressed (thereby changing the selection).

Without these changes, CPU usage jumps to 100% and stays that way for a minute. With these changes, it should drop back to single-digit numbers in 2-3 seconds.
sourcegraph · Oct 29, 2024 · 23af99f · 23af99f
1 parent 3091beb
commit 23af99f
Showing 1 changed file with 6 additions and 3 deletions.
diff --git a/lib/shared/src/token/counter.ts b/lib/shared/src/token/counter.ts
@@ -1,7 +1,7 @@
 import { Tiktoken } from 'js-tiktoken/lite'
 import type { TokenBudget, TokenUsage } from '.'
 import type { ChatContextTokenUsage, TokenUsageType } from '.'
-import type { ModelContextWindow } from '..'
+import { EXTENDED_USER_CONTEXT_TOKEN_BUDGET, type ModelContextWindow } from '..'
 import type { Message, PromptString } from '..'
 import { CORPUS_CONTEXT_ALLOCATION } from './constants'
 
@@ -67,11 +67,14 @@ export async function getTokenCounterUtils(): Promise<TokenCounterUtils> {
                     },
 
                     countTokens(text: string): number {
-                        return tokenCounterUtils.encode(text).length
+                        const wordCount = text.trim().split(/\s+/).length
+                        return wordCount > EXTENDED_USER_CONTEXT_TOKEN_BUDGET
+                            ? wordCount
+                            : this.encode(text).length
                     },
 
                     countPromptString(text: PromptString): number {
-                        return tokenCounterUtils.encode(text.toString()).length
+                        return this.countTokens(text.toString())
                     },
 
                     getMessagesTokenCount(messages: Message[]): number {