fix: enhancing PDF feature

hoarder-app · Apr 11, 2024 · beeaf5a
1 parent 4520eb6
commit beeaf5a
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 33 deletions.
diff --git a/apps/workers/openaiWorker.ts b/apps/workers/openaiWorker.ts
@@ -77,32 +77,22 @@ export class OpenAiWorker {
   }
 }
 
-function promptFactory(type: "text" | "web" | "pdf" | "image") {
-  const typeContent = {
-    text: "User Note",
-    web: "HTML page",
-    pdf: "PDF file",
-    image: "Image",
-  };
-  return `I'm building a read-it-later app and I need your help with automatic tagging.
-${
-  type === "web" || type === "pdf" || type === "text"
-    ? `You are currently analyzing the content of a ${typeContent[type]}, please analyze the content after the sentence "CONTENT START HERE:"`
-    : `Please analyze the attached image`
-}
-Suggest relevant tags that describe its key themes, topics, and main ideas.
+const IMAGE_PROMPT_BASE = `
+I'm building a read-it-later app and I need your help with automatic tagging.
+Please analyze the attached image and suggest relevant tags that describe its key themes, topics, and main ideas.
 Aim for a variety of tags, including broad categories, specific keywords, and potential sub-genres. The tags language must be ${serverConfig.inference.inferredTagLang}.
-If the tag is not generic enough, don't include it. Aim for 5-8 tags.
-If there are no good tags, don't emit any.
+If the tag is not generic enough, don't include it. Aim for 10-15 tags. If there are no good tags, don't emit any. You must respond in valid JSON
+with the key "tags" and the value is list of tags. Don't wrap the response in a markdown code.`;
+
+const TEXT_PROMPT_BASE = `
+I'm building a read-it-later app and I need your help with automatic tagging.
+Please analyze the text after the sentence "CONTENT START HERE:" and suggest relevant tags that describe its key themes, topics, and main ideas.
+Aim for a variety of tags, including broad categories, specific keywords, and potential sub-genres. The tags language must be ${serverConfig.inference.inferredTagLang}. If it's a famous website
+you may also include a tag for the website. If the tag is not generic enough, don't include it. Aim for 3-5 tags. If there are no good tags, don't emit any.
+The content can include text for cookie consent and privacy policy, ignore those while tagging.
 You must respond in JSON with the key "tags" and the value is list of tags.
-In addition to the tags key, you should include a description key which includes a text that describes the content of the ${typeContent[type]}.
-Don't wrap the response in a markdown code.`;
-}
-
-const TEXT_PROMPT = promptFactory("text");
-const WEB_PROMPT = promptFactory("web");
-const IMAGE_PROMPT = promptFactory("image");
-const PDF_PROMPT = promptFactory("pdf");
+CONTENT START HERE:
+`;
 
 function buildPrompt(
   bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
@@ -119,7 +109,7 @@ function buildPrompt(
       content = truncateContent(content);
     }
     return `
-${WEB_PROMPT}
+${TEXT_PROMPT_BASE}
 URL: ${bookmark.link.url}
 Title: ${bookmark.link.title ?? ""}
 Description: ${bookmark.link.description ?? ""}
@@ -131,7 +121,7 @@ Content: ${content ?? ""}
     const content = truncateContent(bookmark.text.text ?? "");
     // TODO: Ensure that the content doesn't exceed the context length of openai
     return `
-${TEXT_PROMPT}
+${TEXT_PROMPT_BASE}
 ${content}
   `;
   }
@@ -167,7 +157,7 @@ async function inferTagsFromImage(
   }
   const base64 = asset.toString("base64");
   return inferenceClient.inferFromImage(
-    IMAGE_PROMPT,
+    IMAGE_PROMPT_BASE,
     metadata.contentType,
     base64,
   );
@@ -202,7 +192,7 @@ async function inferTagsFromPDF(
     })
     .where(eq(bookmarkAssets.id, bookmark.id));
 
-  const prompt = `${PDF_PROMPT}
+  const prompt = `${TEXT_PROMPT_BASE}
 Content: ${truncateContent(pdfParse.text)}
 `;
   return inferenceClient.inferFromText(prompt);

diff --git a/apps/workers/searchWorker.ts b/apps/workers/searchWorker.ts
@@ -76,7 +76,6 @@ async function runIndex(
       ...(bookmark.asset
         ? {
             content: bookmark.asset.content,
-            info: bookmark.asset.info,
             metadata: bookmark.asset.metadata,
           }
         : undefined),

diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
@@ -19,16 +19,14 @@ export function withTimeout<T, Ret>(
 
 export async function readPDFText(buffer: Buffer): Promise<{
   text: string;
-  metadata: Record<string, string | Record<string, string>>;
+  metadata: Record<string, string>;
 }> {
   return new Promise((resolve, reject) => {
     // Need raw text flag represents as number (1), reference : https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
     const pdfParser = new PDFParser(null, 1);
     pdfParser.on("pdfParser_dataError", reject);
     pdfParser.on("pdfParser_dataReady", (pdfData) => {
-      console.log(pdfParser);
       // eslint-disable-next-line
-      console.log((pdfParser as any).getRawTextContent());
       resolve({
         // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
         // eslint-disable-next-line

diff --git a/packages/shared/search.ts b/packages/shared/search.ts
@@ -11,7 +11,7 @@ export const zBookmarkIdxSchema = z.object({
   title: z.string().nullish(),
   description: z.string().nullish(),
   content: z.string().nullish(),
-  metadata: z.record(z.string()).nullish(),
+  metadata: z.string().nullish(),
   fileName: z.string().nullish(),
   createdAt: z.string().nullish(),
   note: z.string().nullish(),