diff --git a/packages/cdk/bin/generative-ai-use-cases.ts b/packages/cdk/bin/generative-ai-use-cases.ts index 56368db0..bf790e11 100644 --- a/packages/cdk/bin/generative-ai-use-cases.ts +++ b/packages/cdk/bin/generative-ai-use-cases.ts @@ -54,7 +54,7 @@ const anonymousUsageTracking: boolean = !!app.node.tryGetContext( ); const vpcId = app.node.tryGetContext('vpcId'); -if (typeof vpcId != 'undefined' && vpcId != null && typeof vpcId != 'string' ) { +if (typeof vpcId != 'undefined' && vpcId != null && typeof vpcId != 'string') { throw new Error('vpcId must be string or undefined'); } if (typeof vpcId == 'string' && !vpcId.match(/^vpc-/)) { diff --git a/packages/web/src/@types/navigate.d.ts b/packages/web/src/@types/navigate.d.ts index 26f6733e..4abcd505 100644 --- a/packages/web/src/@types/navigate.d.ts +++ b/packages/web/src/@types/navigate.d.ts @@ -62,3 +62,7 @@ export type WebContentPageQueryParams = BaseQueryParams & { url?: string; context?: string; }; + +export type VideoAnalyzerPageQueryParams = BaseQueryParams & { + content: string; +}; diff --git a/packages/web/src/App.tsx b/packages/web/src/App.tsx index 3b888565..3020bbf8 100644 --- a/packages/web/src/App.tsx +++ b/packages/web/src/App.tsx @@ -17,6 +17,7 @@ import { PiX, PiRobot, PiUploadSimple, + PiVideoCamera, } from 'react-icons/pi'; import { Outlet } from 'react-router-dom'; import Drawer, { ItemProps } from './components/Drawer'; @@ -26,11 +27,14 @@ import useDrawer from './hooks/useDrawer'; import useConversation from './hooks/useConversation'; import PopupInterUseCasesDemo from './components/PopupInterUseCasesDemo'; import useInterUseCases from './hooks/useInterUseCases'; +import { MODELS } from './hooks/useModel'; const ragEnabled: boolean = import.meta.env.VITE_APP_RAG_ENABLED === 'true'; const agentEnabled: boolean = import.meta.env.VITE_APP_AGENT_ENABLED === 'true'; const recognizeFileEnabled: boolean = import.meta.env.VITE_APP_RECOGNIZE_FILE_ENABLED === 'true'; +const { multiModalModelIds } = MODELS; +const multiModalEnabled: boolean = multiModalModelIds.length > 0; const items: ItemProps[] = [ { @@ -103,6 +107,14 @@ const items: ItemProps[] = [ icon: , display: 'usecase' as const, }, + multiModalEnabled + ? { + label: '映像分析', + to: '/video', + icon: , + display: 'usecase' as const, + } + : null, { label: '音声認識', to: '/transcribe', diff --git a/packages/web/src/hooks/useFiles.ts b/packages/web/src/hooks/useFiles.ts index dbb0f48b..be0de309 100644 --- a/packages/web/src/hooks/useFiles.ts +++ b/packages/web/src/hooks/useFiles.ts @@ -3,7 +3,7 @@ import useFileApi from './useFileApi'; import { UploadedFileType } from 'generative-ai-use-cases-jp'; import { produce } from 'immer'; -const extractBaseURL = (url: string) => { +export const extractBaseURL = (url: string) => { return url.split(/[?#]/)[0]; }; const useFilesState = create<{ diff --git a/packages/web/src/main.tsx b/packages/web/src/main.tsx index 251f771d..65d7ab76 100644 --- a/packages/web/src/main.tsx +++ b/packages/web/src/main.tsx @@ -16,6 +16,7 @@ import SummarizePage from './pages/SummarizePage'; import GenerateTextPage from './pages/GenerateTextPage'; import EditorialPage from './pages/EditorialPage'; import TranslatePage from './pages/TranslatePage'; +import VideoAnalyzerPage from './pages/VideoAnalyzerPage'; import NotFound from './pages/NotFound'; import KendraSearchPage from './pages/KendraSearchPage'; import RagPage from './pages/RagPage'; @@ -24,6 +25,7 @@ import GenerateImagePage from './pages/GenerateImagePage'; import TranscribePage from './pages/TranscribePage'; import AgentChatPage from './pages/AgentChatPage.tsx'; import FileUploadPage from './pages/FileUploadPage.tsx'; +import { MODELS } from './hooks/useModel'; const ragEnabled: boolean = import.meta.env.VITE_APP_RAG_ENABLED === 'true'; const samlAuthEnabled: boolean = @@ -31,6 +33,8 @@ const samlAuthEnabled: boolean = const agentEnabled: boolean = import.meta.env.VITE_APP_AGENT_ENABLED === 'true'; const recognizeFileEnabled: boolean = import.meta.env.VITE_APP_RECOGNIZE_FILE_ENABLED === 'true'; +const { multiModalModelIds } = MODELS; +const multiModalEnabled: boolean = multiModalModelIds.length > 0; const routes: RouteObject[] = [ { @@ -81,6 +85,12 @@ const routes: RouteObject[] = [ path: '/transcribe', element: , }, + multiModalEnabled + ? { + path: '/video', + element: , + } + : null, recognizeFileEnabled ? { path: '/file', diff --git a/packages/web/src/pages/VideoAnalyzerPage.tsx b/packages/web/src/pages/VideoAnalyzerPage.tsx new file mode 100644 index 00000000..c240a69d --- /dev/null +++ b/packages/web/src/pages/VideoAnalyzerPage.tsx @@ -0,0 +1,337 @@ +import React, { + useCallback, + useEffect, + useState, + useRef, + useMemo, +} from 'react'; +import { useLocation } from 'react-router-dom'; +import useChat from '../hooks/useChat'; +import useTyping from '../hooks/useTyping'; +import useFileApi from '../hooks/useFileApi'; +import { UploadedFileType } from 'generative-ai-use-cases-jp'; +import { extractBaseURL } from '../hooks/useFiles'; +import { create } from 'zustand'; +import { getPrompter } from '../prompts'; +import { VideoAnalyzerPageQueryParams } from '../@types/navigate'; +import { MODELS } from '../hooks/useModel'; +import Button from '../components/Button'; +import Markdown from '../components/Markdown'; +import InputChatContent from '../components/InputChatContent'; +import Card from '../components/Card'; +import Select from '../components/Select'; +import queryString from 'query-string'; + +type StateType = { + content: string; + setContent: (c: string) => void; + analysis: string; + setAnalysis: (a: string) => void; + clear: () => void; +}; + +const useVideoAnalyzerPageState = create((set) => { + const INIT_STATE = { + content: '', + analysis: '', + }; + return { + ...INIT_STATE, + setContent: (c: string) => { + set(() => ({ + content: c, + })); + }, + setAnalysis: (a: string) => { + set(() => ({ + analysis: a, + })); + }, + clear: () => { + set(INIT_STATE); + }, + }; +}); + +const VideoAnalyzerPage: React.FC = () => { + const { content, setContent, analysis, setAnalysis, clear } = + useVideoAnalyzerPageState(); + const [mediaStream, setMediaStream] = useState(null); + const [recording, setRecording] = useState(false); + const [devices, setDevices] = useState<{ value: string; label: string }[]>( + [] + ); + const [deviceId, setDeviceId] = useState(''); + const [sending, setSending] = useState(false); + const videoElement = useRef(null); + const callbackRef = useRef<() => void>(); + const { getSignedUrl, uploadFile } = useFileApi(); + const { pathname, search } = useLocation(); + const { + getModelId, + setModelId, + loading, + messages, + postChat, + clear: clearChat, + } = useChat(pathname); + const { setTypingTextInput, typingTextOutput } = useTyping(loading); + const { modelIds: availableModels } = MODELS; + const availableMultiModalModels = useMemo(() => { + return availableModels.filter((modelId) => + MODELS.multiModalModelIds.includes(modelId) + ); + }, [availableModels]); + const modelId = getModelId(); + const prompter = useMemo(() => { + return getPrompter(modelId); + }, [modelId]); + + useEffect(() => { + const _modelId = !modelId ? availableMultiModalModels[0] : modelId; + if (search !== '') { + const params = queryString.parse(search) as VideoAnalyzerPageQueryParams; + setContent(params.content); + setModelId( + availableMultiModalModels.includes(params.modelId ?? '') + ? params.modelId! + : _modelId + ); + } else { + setModelId(_modelId); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [setContent, modelId, availableMultiModalModels, search]); + + useEffect(() => { + setTypingTextInput(analysis); + }, [analysis, setTypingTextInput]); + + useEffect(() => { + const getDevices = async () => { + // 新規で画面を開いたユーザーにカメラの利用を要求する (ダミーのリクエスト) + const dummyStream = await navigator.mediaDevices.getUserMedia({ + audio: false, + video: true, + }); + + if (dummyStream) { + // 録画ボタンがついてしまうため消す + dummyStream.getTracks().forEach((track) => track.stop()); + + const devices = await navigator.mediaDevices.enumerateDevices(); + const videoDevices = devices + .filter((device) => device.kind === 'videoinput') + .map((device) => { + return { + value: device.deviceId, + label: device.label.replace(/\s\(.*?\)/g, ''), + }; + }); + setDevices(videoDevices); + } + }; + + getDevices(); + }, []); + + useEffect(() => { + if (deviceId.length === 0 && devices.length > 0) { + setDeviceId(devices[0].value); + } + }, [deviceId, devices]); + + useEffect(() => { + if (messages.length === 0) return; + const _lastMessage = messages[messages.length - 1]; + if (_lastMessage.role !== 'assistant') return; + const _response = messages[messages.length - 1].content; + setAnalysis(_response.trim()); + }, [messages, setAnalysis]); + + const onClickClear = useCallback(() => { + clear(); + clearChat(); + }, [clear, clearChat]); + + const sendFrame = useCallback(() => { + if (!videoElement.current) return; + + setSending(true); + + const canvas = document.createElement('canvas'); + canvas.width = videoElement.current.videoWidth; + canvas.height = videoElement.current.videoHeight; + const context = canvas.getContext('2d'); + context!.drawImage(videoElement.current, 0, 0, canvas.width, canvas.height); + // toDataURL() で返す値は以下の形式 (;base64, 以降のみを使う) + // ``` + // data:image/png;base64,<以下base64...> + // ``` + const imageBase64 = canvas.toDataURL('image/png').split(';base64,')[1]; + + canvas.toBlob(async (blob) => { + const file = new File([blob!], 'tmp.png', { type: 'image/png' }); + const signedUrl = (await getSignedUrl({ mediaFormat: 'png' })).data; + await uploadFile(signedUrl, { file }); + const baseUrl = extractBaseURL(signedUrl); + const uploadedFiles: UploadedFileType[] = [ + { + file, + s3Url: baseUrl, + base64EncodedImage: imageBase64, + uploading: false, + }, + ]; + + postChat( + prompter.videoAnalyzerPrompt({ + content, + }), + false, + undefined, + undefined, + undefined, + uploadedFiles + ); + + setSending(false); + }); + }, [prompter, content, postChat, getSignedUrl, uploadFile]); + + const startRecording = useCallback(async () => { + try { + if (videoElement.current) { + setRecording(true); + + const stream = await navigator.mediaDevices.getUserMedia({ + audio: false, + video: { + deviceId: { + exact: deviceId, + }, + }, + }); + videoElement.current.srcObject = stream; + videoElement.current.play(); + + setMediaStream(stream); + } + } catch (e) { + console.error('ウェブカメラにアクセスできませんでした:', e); + } + }, [setRecording, videoElement, deviceId]); + + // ビデオの停止 + const stopRecording = useCallback(() => { + if (mediaStream) { + mediaStream.getTracks().forEach((track) => track.stop()); + } + setRecording(false); + }, [mediaStream]); + + // Callback 関数を常に最新にしておく + useEffect(() => { + callbackRef.current = stopRecording; + }, [stopRecording]); + + // Unmount 時 (画面を離れた時) の処理 + useEffect(() => { + return () => { + if (callbackRef.current) { + callbackRef.current(); + callbackRef.current = undefined; + } + }; + }, []); + + return ( + + + 映像分析 + + + + + + + + + {recording ? ( + <> + + 停止 + + > + ) : ( + <> + + 開始 + + > + )} + + + + + + + + + { + return { value: m, label: m }; + })} + label="モデル" + /> + + + {typingTextOutput} + {(loading || sending) && ( + + )} + + + + クリア + + + + + + + + + + + + ); +}; + +export default VideoAnalyzerPage; diff --git a/packages/web/src/prompts/claude.ts b/packages/web/src/prompts/claude.ts index 55b26478..678c4482 100644 --- a/packages/web/src/prompts/claude.ts +++ b/packages/web/src/prompts/claude.ts @@ -8,6 +8,7 @@ import { SetTitleParams, SummarizeParams, TranslateParams, + VideoAnalyzerParams, WebContentParams, } from './index'; @@ -17,7 +18,8 @@ const systemContexts: { [key: string]: string } = { 'あなたは文章を要約するAIアシスタントです。最初のチャットで要約の指示を出すので、その後のチャットで要約結果の改善を行なってください。', '/editorial': 'あなたは丁寧に細かいところまで指摘する厳しい校閲担当者です。', '/generate': 'あなたは指示に従って文章を作成するライターです。', - '/translate': '以下は文章を翻訳したいユーザーと、ユーザーの意図と文章を理解して適切に翻訳する AI のやりとりです。ユーザーは タグで翻訳する文章と、 タグで翻訳先の言語を与えます。また、<考慮してほしいこと> タグで翻訳時に考慮してほしいことを与えることもあります。AI は <考慮してほしいこと> がある場合は考慮しつつ、 で与えるテキストを で与える言語に翻訳してください。出力は{翻訳結果}の形で翻訳した文章だけを出力してください。それ以外の文章は一切出力してはいけません。', + '/translate': + '以下は文章を翻訳したいユーザーと、ユーザーの意図と文章を理解して適切に翻訳する AI のやりとりです。ユーザーは タグで翻訳する文章と、 タグで翻訳先の言語を与えます。また、<考慮してほしいこと> タグで翻訳時に考慮してほしいことを与えることもあります。AI は <考慮してほしいこと> がある場合は考慮しつつ、 で与えるテキストを で与える言語に翻訳してください。出力は{翻訳結果}の形で翻訳した文章だけを出力してください。それ以外の文章は一切出力してはいけません。', '/web-content': 'あなたはHTMLからコンテンツを抽出する仕事に従事してます。', '/rag': '', '/image': `あなたはStable Diffusionのプロンプトを生成するAIアシスタントです。 @@ -64,6 +66,8 @@ const systemContexts: { [key: string]: string } = { 出力は必ず prompt キー、 negativePrompt キー, comment キー, recommendedStylePreset キーを包有した JSON 文字列だけで終えてください。それ以外の情報を出力してはいけません。もちろん挨拶や説明を前後に入れてはいけません。例外はありません。`, + '/video': + 'あなたは映像分析を支援するAIアシスタントです。これから映像のフレーム画像とユーザーの入力 を与えるので、 の指示に従って答えを出力してください。出力は{答え}の形で出力してください。それ以外の文章は一切出力してはいけません。また出力は {} で囲わないでください。', }; export const claudePrompter: Prompter = { @@ -233,6 +237,9 @@ ${params `; } }, + videoAnalyzerPrompt(params: VideoAnalyzerParams): string { + return `${params.content}`; + }, setTitlePrompt(params: SetTitleParams): string { return `以下はユーザーとAIアシスタントの会話です。まずはこちらを読み込んでください。${JSON.stringify( params.messages diff --git a/packages/web/src/prompts/index.ts b/packages/web/src/prompts/index.ts index eee95ad2..5100fd41 100644 --- a/packages/web/src/prompts/index.ts +++ b/packages/web/src/prompts/index.ts @@ -50,6 +50,10 @@ export type RagParams = { referenceItems?: RetrieveResultItem[]; }; +export type VideoAnalyzerParams = { + content: string; +}; + export type SetTitleParams = { messages: UnrecordedMessage[]; }; @@ -75,6 +79,7 @@ export interface Prompter { translatePrompt(params: TranslateParams): string; webContentPrompt(params: WebContentParams): string; ragPrompt(params: RagParams): string; + videoAnalyzerPrompt(params: VideoAnalyzerParams): string; setTitlePrompt(params: SetTitleParams): string; promptList(): PromptList; }