diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
index fe18b2af..0777e2ff 100644
--- a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
+++ b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
@@ -3,25 +3,143 @@
 import SwiftUI
+
+struct Message: Identifiable {
+  let id = UUID()
+  var text: String
+  let isUser: Bool
+}
+
 struct ContentView: View {
-  @ObservedObject var tokenUpdater = SharedTokenUpdater.shared
+  @State private var userInput: String = ""
+  @State private var messages: [Message] = []  // Store chat messages locally
+  @State private var isGenerating: Bool = false  // Track token generation state
+  @State private var stats: String = ""  // token generation stats
+  @State private var showAlert: Bool = false
+  @State private var errorMessage: String = ""
+
+  private let generator = GenAIGenerator()
 
   var body: some View {
     VStack {
+      // ChatBubbles
       ScrollView {
-        VStack(alignment: .leading) {
-          ForEach(tokenUpdater.decodedTokens, id: \.self) { token in
-            Text(token)
-              .padding(.horizontal, 5)
+        VStack(alignment: .leading, spacing: 20) {
+          ForEach(messages) { message in
+            ChatBubble(text: message.text, isUser: message.isUser)
+              .padding(.horizontal, 20)
+          }
+          if !stats.isEmpty {
+            Text(stats)
+              .font(.footnote)
+              .foregroundColor(.gray)
+              .padding(.horizontal, 20)
+              .padding(.top, 5)
+              .multilineTextAlignment(.center)
           }
         }
-        .padding()
+        .padding(.top, 20)
       }
-      Button("Generate Tokens") {
-        DispatchQueue.global(qos: .background).async {
-          // TODO: add user prompt question UI
-          GenAIGenerator.generate("Who is the current US president?");
+
+      // User input
+      HStack {
+        TextField("Type your message...", text: $userInput)
+          .padding()
+          .background(Color(.systemGray6))
+          .cornerRadius(20)
+          .padding(.horizontal)
+
+        Button(action: {
+          // Check for non-empty input
+          guard !userInput.trimmingCharacters(in: .whitespaces).isEmpty else { return }
+
+          messages.append(Message(text: userInput, isUser: true))
+          messages.append(Message(text: "", isUser: false))  // Placeholder for AI response
+
+          // clear previously generated tokens
+          SharedTokenUpdater.shared.clearTokens()
+
+          let prompt = userInput
+          userInput = ""
+          isGenerating = true
+
+          DispatchQueue.global(qos: .background).async {
+            generator.generate(prompt)
+          }
+        }) {
+          Image(systemName: "paperplane.fill")
+            .foregroundColor(.white)
+            .padding()
+            .background(isGenerating ? Color.gray : Color.pastelGreen)
+            .clipShape(Circle())
+            .padding(.trailing, 10)
         }
+        .disabled(isGenerating)
       }
-      .padding()
+      .padding(.bottom, 20)
     }
+    .background(Color(.systemGroupedBackground))
+    .edgesIgnoringSafeArea(.bottom)
+    .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationCompleted"))) { _ in
+      isGenerating = false  // Re-enable the button when token generation is complete
+    }
+    .onReceive(SharedTokenUpdater.shared.$decodedTokens) { tokens in
+      // update model response
+      if let lastIndex = messages.lastIndex(where: { !$0.isUser }) {
+        let combinedText = tokens.joined(separator: "")
+        messages[lastIndex].text = combinedText
+      }
+    }
+    .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationStats"))) { notification in
+      if let userInfo = notification.userInfo,
+         let promptProcRate = userInfo["promptProcRate"] as? Double,
+         let tokenGenRate = userInfo["tokenGenRate"] as? Double {
+        stats = String(format: "Token generation rate: %.2f tokens/s. Prompt processing rate: %.2f tokens/s", tokenGenRate, promptProcRate)
+      }
+    }
+    .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationError"))) { notification in
+      if let userInfo = notification.userInfo, let error = userInfo["error"] as? String {
+        errorMessage = error
+        isGenerating = false
+        showAlert = true
+      }
+    }
+    .alert(isPresented: $showAlert) {
+      Alert(
+        title: Text("Error"),
+        message: Text(errorMessage),
+        dismissButton: .default(Text("OK"))
+      )
+    }
   }
 }
+
+struct ChatBubble: View {
+  var text: String
+  var isUser: Bool
+
+  var body: some View {
+    HStack {
+      if isUser {
+        Spacer()
+        Text(text)
+          .padding()
+          .background(Color.pastelGreen)
+          .foregroundColor(.white)
+          .cornerRadius(25)
+          .padding(.horizontal, 10)
+      } else {
+        Text(text)
+          .padding()
+          .background(Color(.systemGray5))
+          .foregroundColor(.black)
+          .cornerRadius(25)
+          .padding(.horizontal, 10)
+        Spacer()
+      }
+    }
+  }
+}
@@ -32,3 +32,8 @@ struct ContentView_Previews: PreviewProvider {
     ContentView()
   }
 }
+
+// Extension for a pastel green color
+extension Color {
+  static let pastelGreen = Color(red: 0.6, green: 0.9, blue: 0.6)
+}
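Note that the view above couples to the generator through stringly-typed NotificationCenter names ("TokenGenerationCompleted", "TokenGenerationStats", "TokenGenerationError"), which must match the strings posted from GenAIGenerator.mm byte for byte. If one wanted to centralize them, a minimal sketch (a hypothetical helper, not part of this diff) could look like:

```swift
import Foundation

// Hypothetical helper (not in this PR): typed wrappers for the raw
// notification names shared between ContentView.swift and GenAIGenerator.mm,
// so a typo in one of the strings fails at compile time instead of silently.
extension Notification.Name {
  static let tokenGenerationCompleted = Notification.Name("TokenGenerationCompleted")
  static let tokenGenerationStats = Notification.Name("TokenGenerationStats")
  static let tokenGenerationError = Notification.Name("TokenGenerationError")
}
```

ContentView could then subscribe with `.onReceive(NotificationCenter.default.publisher(for: .tokenGenerationStats))` instead of repeating the string literal.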
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.h b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.h
index 5404000f..288c914d 100644
--- a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.h
+++ b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.h
@@ -11,7 +11,7 @@ NS_ASSUME_NONNULL_BEGIN
 
 @interface GenAIGenerator : NSObject
 
-+ (void)generate:(NSString *)input_user_question;
+- (void)generate:(NSString *)input_user_question;
 
 @end
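This header change switches `generate:` from a class method (`+`) to an instance method (`-`), which is what allows the implementation below to cache the model and tokenizer between prompts. A usage sketch from the Swift side, mirroring what ContentView does (the prompt string is illustrative):

```swift
import Foundation

// The generator is created once and reused, so the OgaModel and OgaTokenizer
// it caches survive across prompts instead of being rebuilt on every call.
let generator = GenAIGenerator()
DispatchQueue.global(qos: .background).async {
  generator.generate("What is the golden ratio?")
}
```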
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
index d430f16a..ddcf2b10 100644
--- a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
+++ b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
@@ -1,49 +1,151 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
 #import "GenAIGenerator.h"
+#include <chrono>
+#include <vector>
 #include "LocalLLM-Swift.h"
 #include "ort_genai.h"
 #include "ort_genai_c.h"
 
+const size_t kMaxTokens = 200;
+
+@interface GenAIGenerator () {
+  std::unique_ptr<OgaModel> model;
+  std::unique_ptr<OgaTokenizer> tokenizer;
+}
+@end
+
 @implementation GenAIGenerator
 
-+ (void)generate:(nonnull NSString*)input_user_question {
-  NSString* llmPath = [[NSBundle mainBundle] resourcePath];
-  const char* modelPath = llmPath.cString;
+typedef std::chrono::steady_clock Clock;
+typedef std::chrono::time_point<Clock> TimePoint;
+
+- (instancetype)init {
+  self = [super init];
+  if (self) {
+    self->model = nullptr;
+    self->tokenizer = nullptr;
+  }
+  return self;
+}
+
+- (void)generate:(nonnull NSString*)input_user_question {
+  std::vector<long long> tokenTimes;  // per-token generation times
+  tokenTimes.reserve(kMaxTokens);
+
+  TimePoint startTime, firstTokenTime, tokenStartTime;
+
+  try {
+    NSLog(@"Starting token generation...");
+
+    if (!self->model) {
+      NSLog(@"Creating model...");
+      NSString* llmPath = [[NSBundle mainBundle] resourcePath];
+      const char* modelPath = llmPath.cString;
+      self->model = OgaModel::Create(modelPath);  // throws exception
+    }
+
+    if (!self->tokenizer) {
+      NSLog(@"Creating tokenizer...");
+      self->tokenizer = OgaTokenizer::Create(*self->model);  // throws exception
+    }
+
+    auto tokenizer_stream = OgaTokenizerStream::Create(*self->tokenizer);
 
-  auto model = OgaModel::Create(modelPath);
-  auto tokenizer = OgaTokenizer::Create(*model);
+    // Construct the prompt
+    NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
+    const char* prompt = [promptString UTF8String];
 
-  NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
-  const char* prompt = [promptString UTF8String];
+    // Encode the prompt
+    auto sequences = OgaSequences::Create();
+    self->tokenizer->Encode(prompt, *sequences);
 
-  auto sequences = OgaSequences::Create();
-  tokenizer->Encode(prompt, *sequences);
+    size_t promptTokensCount = sequences->SequenceCount(0);
 
-  auto params = OgaGeneratorParams::Create(*model);
-  params->SetSearchOption("max_length", 200);
-  params->SetInputSequences(*sequences);
+    NSLog(@"Setting generator parameters...");
+    auto params = OgaGeneratorParams::Create(*self->model);
+    params->SetSearchOption("max_length", kMaxTokens);
+    params->SetInputSequences(*sequences);
 
-  // Streaming Output to generate token by token
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+    auto generator = OgaGenerator::Create(*self->model, *params);
 
-  auto generator = OgaGenerator::Create(*model, *params);
+    bool isFirstToken = true;
+    NSLog(@"Starting token generation loop...");
 
-  while (!generator->IsDone()) {
-    generator->ComputeLogits();
-    generator->GenerateNextToken();
+    startTime = Clock::now();
+    while (!generator->IsDone()) {
+      tokenStartTime = Clock::now();
 
-    const int32_t* seq = generator->GetSequenceData(0);
-    size_t seq_len = generator->GetSequenceCount(0);
-    const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
+      generator->ComputeLogits();
+      generator->GenerateNextToken();
 
-    NSLog(@"Decoded tokens: %s", decode_tokens);
+      if (isFirstToken) {
+        firstTokenTime = Clock::now();
+        isFirstToken = false;
+      }
 
-    // Add decoded token to SharedTokenUpdater
-    NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
-    [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
+      // Get the sequence data and decode the token
+      const int32_t* seq = generator->GetSequenceData(0);
+      size_t seq_len = generator->GetSequenceCount(0);
+      const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
+
+      if (!decode_tokens) {
+        throw std::runtime_error("Token decoding failed.");
+      }
+
+      // Measure token generation time excluding logging
+      TimePoint tokenEndTime = Clock::now();
+      auto tokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(tokenEndTime - tokenStartTime).count();
+      tokenTimes.push_back(tokenDuration);
+      NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
+      [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
+    }
+
+    TimePoint endTime = Clock::now();
+    // Log token times
+    NSLog(@"Per-token generation times: %@", [self formatTokenTimes:tokenTimes]);
+
+    // Calculate metrics
+    auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
+    auto firstTokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count();
+
+    double promptProcRate = (double)promptTokensCount * 1000.0 / firstTokenDuration;
+    double tokenGenRate = (double)(tokenTimes.size() - 1) * 1000.0 / (totalDuration - firstTokenDuration);
+
+    NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %zu",
+          totalDuration, firstTokenDuration, tokenTimes.size());
+    NSLog(@"Prompt tokens: %zu, Prompt Processing Rate: %f tokens/s", promptTokensCount, promptProcRate);
+    NSLog(@"Generated tokens: %zu, Token Generation Rate: %f tokens/s", tokenTimes.size(), tokenGenRate);
+
+    NSDictionary* stats = @{@"tokenGenRate" : @(tokenGenRate), @"promptProcRate" : @(promptProcRate)};
+    // notify main thread that token generation is complete
+    dispatch_async(dispatch_get_main_queue(), ^{
+      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
+      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
+    });
+
+    NSLog(@"Token generation completed.");
+
+  } catch (const std::exception& e) {
+    NSString* errorMessage = [NSString stringWithUTF8String:e.what()];
+    NSLog(@"Error during generation: %@", errorMessage);
+
+    // Send error to the UI
+    NSDictionary* errorInfo = @{@"error" : errorMessage};
+    dispatch_async(dispatch_get_main_queue(), ^{
+      [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationError" object:nil userInfo:errorInfo];
+    });
   }
 }
+
+// Utility function to format token times for logging
+- (NSString*)formatTokenTimes:(const std::vector<long long>&)tokenTimes {
+  NSMutableString* formattedTimes = [NSMutableString string];
+  for (size_t i = 0; i < tokenTimes.size(); i++) {
+    [formattedTimes appendFormat:@"%lld ms, ", tokenTimes[i]];
+  }
+  return [formattedTimes copy];
+}
+
 @end
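For readers skimming the metrics code above: the time to the first token is treated as prompt-processing (prefill) time, and the first token is excluded from the generation rate. A worked restatement of that arithmetic with hypothetical numbers (the values below are illustrative, not measurements from this PR):

```swift
// Worked example of the rate math in generate:, all times in milliseconds.
let promptTokens = 40.0
let firstTokenMs = 500.0   // elapsed time from start to the first token
let promptProcRate = promptTokens * 1000.0 / firstTokenMs  // = 80 tokens/s

let generatedTokens = 100.0  // tokenTimes.size()
let totalMs = 5500.0         // elapsed time for the whole loop
// Remaining tokens divided by the time spent after the first token:
let tokenGenRate = (generatedTokens - 1) * 1000.0 / (totalMs - firstTokenMs)  // = 19.8 tokens/s
```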
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
index 8e53b58f..a9438224 100644
--- a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
+++ b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
@@ -65,7 +65,7 @@
 git clone https://github.com/microsoft/onnxruntime-genai
 cd onnxruntime-genai
 
-python3 build.py --parallel --build_dir ./build_iphoneos --ios --ios_sysroot iphoneos --ios_arch arm64 --ios_deployment_target 16.6 --cmake_generator Xcode
+python3 build.py --parallel --build_dir ./build_iphoneos --ios --apple_sysroot iphoneos --osx_arch arm64 --apple_deploy_target 16.6 --cmake_generator Xcode
 ```
 
@@ -98,7 +98,7 @@ The app uses Objective-C/C++ since using Generative AI with ONNX Runtime C++ API
 
 Download from hf repo: <https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx>
 
-After downloading completes, you need to copy files over to the `Resources` directory in the `Destination` column of `Target-LocalLLM`->`Build Phases`-> `New Copy File Phases` -> `Copy Files`.
+After downloading the files, click the `LocalLLM` project in the sidebar and go to `Targets > LocalLLM > Build Phases`. Find the `Copy Files` section, set the `Destination` to `Resources`, and add the downloaded files.
 
 Upon app launching, Xcode will automatically copy and install the model files from Resources folder and directly download to the iOS device.
 
@@ -106,4 +106,6 @@
 
 **Note**: The current app only sets up with a simple initial prompt question, you can adjust/try your own or refine the UI based on requirements.
 
-***Notice:*** The current Xcode project runs on iOS 16.6, feel free to adjust latest iOS/build for lates iOS versions accordingly.
\ No newline at end of file
+***Notice:*** The current Xcode project runs on iOS 16.6; feel free to adjust the build for the latest iOS versions as needed.
+
+![Simulator screenshot of the chat UI](Simulator%20Screenshot%20-%20iPhone%2016.png)
\ No newline at end of file
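The Copy Files step above matters because GenAIGenerator.mm resolves the model directory from the app bundle's resource path at runtime. A hedged sketch for sanity-checking that the files actually landed in the bundle (no such check exists in this PR; the print is illustrative):

```swift
import Foundation

// Optional launch-time check: list whatever the Build Phases step copied
// into the app bundle, which is the directory OgaModel::Create() will read.
if let resourcePath = Bundle.main.resourcePath {
  let contents = (try? FileManager.default.contentsOfDirectory(atPath: resourcePath)) ?? []
  print("Bundle resources:", contents)
}
```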
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift
index a4680041..260a9154 100644
--- a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift
+++ b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift
@@ -14,4 +14,10 @@ import Foundation
       self.decodedTokens.append(token)
     }
   }
+
+  @objc func clearTokens() {
+    DispatchQueue.main.async {
+      self.decodedTokens.removeAll()
+    }
+  }
 }
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png
new file mode 100644
index 00000000..c68e5a2b
Binary files /dev/null and b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png differ
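For context on the SharedTokenUpdater change above: `clearTokens` hops to the main queue because `decodedTokens` is a published property observed by SwiftUI, so mutations must happen on the main thread. The diff only shows the additions; the surrounding class presumably looks roughly like this sketch (assumed shape for illustration, not verbatim source):

```swift
import Combine
import Foundation

// Assumed shape of the existing class -- only clearTokens is new in this diff.
@objc class SharedTokenUpdater: NSObject, ObservableObject {
  @objc static let shared = SharedTokenUpdater()

  // Drives ContentView via .onReceive(SharedTokenUpdater.shared.$decodedTokens).
  @Published var decodedTokens: [String] = []

  @objc func addDecodedToken(_ token: String) {
    DispatchQueue.main.async {
      self.decodedTokens.append(token)  // publish on the main thread
    }
  }
}
```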