microsoft · vraspar · Oct 14, 2024 · Oct 1, 2024 · Oct 3, 2024 · Oct 9, 2024
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
@@ -3,25 +3,140 @@
 
 import SwiftUI
 
+
+struct Message: Identifiable {
+    let id: UUID = UUID()  // unique per message; provides the Identifiable identity ForEach uses
+    var text: String  // mutable: the model's reply is rewritten as tokens stream in
+    let isUser: Bool  // true for the user's prompt, false for the model's reply
+}
+
 struct ContentView: View {
-    @ObservedObject var tokenUpdater = SharedTokenUpdater.shared
+    @State private var userInput: String = ""
+    @State private var messages: [Message] = []  // Store chat messages locally
+    @State private var isGenerating: Bool = false  // Track token generation state
+    @State private var stats: String = ""  // token genetation stats
+    @State private var showAlert: Bool = false
+    @State private var errorMessage: String = ""
 
     var body: some View {
         VStack {
+            // ChatBubbles
             ScrollView {
-                VStack(alignment: .leading) {
-                    ForEach(tokenUpdater.decodedTokens, id: \.self) { token in
-                        Text(token)
-                           .padding(.horizontal, 5)
+                VStack(alignment: .leading, spacing: 20) {
+                    ForEach(messages) { message in
+                        ChatBubble(text: message.text, isUser: message.isUser)
+                            .padding(.horizontal, 20)
+                    }
+                    if !stats.isEmpty {
+                        Text(stats)
+                            .font(.footnote)
+                            .foregroundColor(.gray)
+                            .padding(.horizontal, 20)
+                            .padding(.top, 5)
+                            .multilineTextAlignment(.center)
                     }
                 }
-                .padding()
+                .padding(.top, 20)
             }
-            Button("Generate Tokens") {
-                DispatchQueue.global(qos: .background).async {
-                    // TODO: add user prompt question UI
-                    GenAIGenerator.generate("Who is the current US president?");
+
+
+            // User input 
+            HStack {
+                TextField("Type your message...", text: $userInput)
+                    .padding()
+                    .background(Color(.systemGray6))
+                    .cornerRadius(20)
+                    .padding(.horizontal)
+
+                Button(action: {
+                    // Check for non-empty input
+                    guard !userInput.trimmingCharacters(in: .whitespaces).isEmpty else { return }
+
+                    messages.append(Message(text: userInput, isUser: true))
+                    messages.append(Message(text: "", isUser: false))  // Placeholder for AI response
+
+
+                    // clear previously generated tokens
+                    SharedTokenUpdater.shared.clearTokens()
+
+                    let prompt = userInput
+                    userInput = ""
+                    isGenerating = true
+
+
+                    DispatchQueue.global(qos: .background).async {
+                        GenAIGenerator.generate(prompt)
+                    }
+                }) {
+                    Image(systemName: "paperplane.fill")
+                        .foregroundColor(.white)
+                        .padding()
+                        .background(isGenerating ? Color.gray : Color.pastelGreen)
+                        .clipShape(Circle())
+                        .padding(.trailing, 10)
                 }
+                .disabled(isGenerating)
+            }
+            .padding(.bottom, 20)
+        }
+        .background(Color(.systemGroupedBackground))
+        .edgesIgnoringSafeArea(.bottom)
+        .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationCompleted"))) { _ in
+            isGenerating = false  // Re-enable the button when token generation is complete
+        }
+        .onReceive(SharedTokenUpdater.shared.$decodedTokens) { tokens in
+            // update model response
+            if let lastIndex = messages.lastIndex(where: { !$0.isUser }) {
+                let combinedText = tokens.joined(separator: "")
+                messages[lastIndex].text = combinedText
+            }
+        }
+        .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationStats"))) { notification in
+            if let userInfo = notification.userInfo,
+               let promptProcRate = userInfo["promptProcRate"] as? Double,
+               let tokenGenRate = userInfo["tokenGenRate"] as? Double {
+                stats = String(format: "Token generation rate: %.2f tokens/s. Prompt processing rate: %.2f tokens/s", tokenGenRate, promptProcRate)
+            }
+        }
+        .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationError"))) { notification in
+            if let userInfo = notification.userInfo, let error = userInfo["error"] as? String {
+                    errorMessage = error
+                    showAlert = true
+            }
+        }
+        .alert(isPresented: $showAlert) {
+            Alert(
+                title: Text("Error"),
+                message: Text(errorMessage),
+                dismissButton: .default(Text("OK"))
+            )
+        }
+
+    }
+}
+
+/// One chat row.
+/// User text sits at the trailing edge on pastel green with white text;
+/// model text sits at the leading edge on a gray bubble.
+struct ChatBubble: View {
+    var text: String
+    var isUser: Bool
+
+    var body: some View {
+        HStack {
+            if isUser {
+                Spacer()
+            }
+            Text(text)
+                .padding()
+                .background(isUser ? Color.pastelGreen : Color(.systemGray5))
+                // Model text uses `.primary`: systemGray5 turns dark in dark mode, where a
+                // fixed black foreground would be unreadable.
+                .foregroundColor(isUser ? Color.white : Color.primary)
+                .cornerRadius(25)
+                .padding(.horizontal, 10)
+            if !isUser {
+                Spacer()
             }
         }
     }
@@ -32,3 +147,8 @@ struct ContentView_Previews: PreviewProvider {
         ContentView()
     }
 }
+
+// Shared pastel green used for user chat bubbles and the enabled send button
+extension Color {
+    static let pastelGreen: Color = .init(red: 0.6, green: 0.9, blue: 0.6)
+}
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
@@ -1,49 +1,143 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
 #import "GenAIGenerator.h"
 #include "LocalLLM-Swift.h"
 #include "ort_genai.h"
 #include "ort_genai_c.h"
-
+#include <chrono>
+#include <vector>
 
 @implementation GenAIGenerator
 
+typedef std::chrono::steady_clock Clock;
+typedef std::chrono::time_point<Clock> TimePoint;
+static std::unique_ptr<OgaModel> model = nullptr;
+static std::unique_ptr<OgaTokenizer> tokenizer = nullptr;
+
 + (void)generate:(nonnull NSString*)input_user_question {
-  NSString* llmPath = [[NSBundle mainBundle] resourcePath];
-  const char* modelPath = llmPath.cString;
+    // Runs one prompt through the model, streaming every decoded token to SharedTokenUpdater.
+    // On success posts "TokenGenerationStats" then "TokenGenerationCompleted" on the main
+    // queue; on failure posts "TokenGenerationError" with the message under @"error".
+    std::vector<long long> tokenTimes; // per-token generation times
+    TimePoint startTime, firstTokenTime, tokenStartTime;
+
+    @try {
+        // The Oga* calls below can throw C++ exceptions, which `@catch (NSException*)` does
+        // not match; the inner try rethrows them as NSException so they reach the handler.
+        try {
+            NSLog(@"Starting token generation...");
+            if (!model) {
+                NSLog(@"Creating model...");
+                NSString* llmPath = [[NSBundle mainBundle] resourcePath];
+                model = OgaModel::Create([llmPath UTF8String]); // throws exception
+                if (!model) {
+                    @throw [NSException exceptionWithName:@"ModelCreationError" reason:@"Failed to create model." userInfo:nil];
+                }
+            }
+            if (!tokenizer) {
+                NSLog(@"Creating tokenizer...");
+                tokenizer = OgaTokenizer::Create(*model);  // throws exception
+                if (!tokenizer) {
+                    @throw [NSException exceptionWithName:@"TokenizerCreationError" reason:@"Failed to create tokenizer." userInfo:nil];
+                }
+            }
+            auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+
+            // Construct the prompt
+            NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
+            const char* prompt = [promptString UTF8String];
+
+            // Encode the prompt
+            auto sequences = OgaSequences::Create();
+            tokenizer->Encode(prompt, *sequences);
 
-  auto model = OgaModel::Create(modelPath);
-  auto tokenizer = OgaTokenizer::Create(*model);
+            size_t promptTokensCount = sequences->SequenceCount(0);
 
-  NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
-  const char* prompt = [promptString UTF8String];
+            NSLog(@"Setting generator parameters...");
+            auto params = OgaGeneratorParams::Create(*model);
+            params->SetSearchOption("max_length", 200);
+            params->SetInputSequences(*sequences);
 
-  auto sequences = OgaSequences::Create();
-  tokenizer->Encode(prompt, *sequences);
+            auto generator = OgaGenerator::Create(*model, *params);
 
-  auto params = OgaGeneratorParams::Create(*model);
-  params->SetSearchOption("max_length", 200);
-  params->SetInputSequences(*sequences);
+            bool isFirstToken = true;
+            NSLog(@"Starting token generation loop...");
+            startTime = Clock::now();
+            while (!generator->IsDone()) {
+                tokenStartTime = Clock::now();
 
-  // Streaming Output to generate token by token
-  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+                generator->ComputeLogits();
+                generator->GenerateNextToken();
 
-  auto generator = OgaGenerator::Create(*model, *params);
+                if (isFirstToken) {
+                    firstTokenTime = Clock::now();
+                    isFirstToken = false;
+                }
 
-  while (!generator->IsDone()) {
-    generator->ComputeLogits();
-    generator->GenerateNextToken();
+                // Get the sequence data and decode the token
+                const int32_t* seq = generator->GetSequenceData(0);
+                size_t seq_len = generator->GetSequenceCount(0);
+                const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
 
-    const int32_t* seq = generator->GetSequenceData(0);
-    size_t seq_len = generator->GetSequenceCount(0);
-    const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
+                if (!decode_tokens) {
+                    @throw [NSException exceptionWithName:@"TokenDecodeError" reason:@"Token decoding failed." userInfo:nil];
+                }
 
-    NSLog(@"Decoded tokens: %s", decode_tokens);
+                // Measure token generation time excluding logging
+                TimePoint tokenEndTime = Clock::now();
+                auto tokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(tokenEndTime - tokenStartTime).count();
+                tokenTimes.push_back(tokenDuration);
+                NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
+                [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
+            }
 
-    // Add decoded token to SharedTokenUpdater
-    NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
-    [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
-  }
+            TimePoint endTime = Clock::now();
+            NSLog(@"Per-token generation times: %@", [self formatTokenTimes:tokenTimes]);
+
+            // Calculate metrics. Durations are in milliseconds, so both rates are scaled by
+            // 1000 to yield tokens/s; a rate is reported as 0 when its time window is empty.
+            auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
+            auto firstTokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count();
+            auto decodeDuration = totalDuration - firstTokenDuration;
+            double promptProcRate = firstTokenDuration > 0 ? (double)promptTokensCount * 1000.0 / firstTokenDuration : 0.0;
+            double tokenGenRate = (tokenTimes.size() > 1 && decodeDuration > 0) ? (double)(tokenTimes.size() - 1) * 1000.0 / decodeDuration : 0.0;
+
+            NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %zu", totalDuration, firstTokenDuration, tokenTimes.size());
+            NSLog(@"Prompt tokens: %zu, Prompt Processing Rate: %f tokens/s", promptTokensCount, promptProcRate);
+            NSLog(@"Generated tokens: %zu, Token Generation Rate: %f tokens/s", tokenTimes.size(), tokenGenRate);
+
+            NSDictionary *stats = @{
+                @"tokenGenRate" : @(tokenGenRate),
+                @"promptProcRate": @(promptProcRate)
+            };
+            // notify main thread that token generation is complete
+            dispatch_async(dispatch_get_main_queue(), ^{
+                [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
+                [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
+            });
+            NSLog(@"Token generation completed.");
+        } catch (const std::exception& ex) {
+            NSString* what = [NSString stringWithUTF8String:ex.what()] ?: @"Unknown C++ exception";
+            @throw [NSException exceptionWithName:@"GenAIError" reason:what userInfo:nil];
+        }
+    } @catch (NSException* e) {
+        NSString* errorMessage = e.reason ?: e.name; // reason may be nil; a nil dictionary value raises
+        NSLog(@"Error during generation: %@", errorMessage);
+        // Send error to the UI
+        NSDictionary *errorInfo = @{@"error": errorMessage};
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationError" object:nil userInfo:errorInfo];
+        });
+    }
 }
+
+// Renders each per-token duration as "<n> ms, " (separator after every entry) for logging
++ (NSString*)formatTokenTimes:(const std::vector<long long>&)tokenTimes {
+    NSMutableString* joined = [NSMutableString string];
+    for (long long elapsedMs : tokenTimes) {
+        [joined appendFormat:@"%lld ms, ", elapsedMs];
+    }
+    return [joined copy];
+}
+
 @end
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
@@ -65,7 +65,7 @@ git clone https://github.com/microsoft/onnxruntime-genai
 
 cd onnxruntime-genai
 
-python3 build.py --parallel --build_dir ./build_iphoneos --ios --ios_sysroot iphoneos --ios_arch arm64 --ios_deployment_target 16.6 --cmake_generator Xcode
+python3 build.py --parallel --build_dir ./build_iphoneos --ios --apple_sysroot iphoneos --osx_arch arm64 --apple_deploy_target 16.6 --cmake_generator Xcode
 
 ```
 
@@ -98,12 +98,14 @@ The app uses Objective-C/C++ since using Generative AI with ONNX Runtime C++ API
 
 Download from hf repo: <https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4>
 
-After downloading completes, you need to copy files over to the `Resources` directory in the `Destination` column of `Target-LocalLLM`->`Build Phases`-> `New Copy File Phases` -> `Copy Files`.
+After downloading the files, click the `LocalLLM` project in the sidebar and go to `Targets > LocalLLM > Build Phases`. Find the Copy Files section, set the Destination to Resources, and add the downloaded files.
 
 Upon app launching, Xcode will automatically copy and install the model files from Resources folder and directly download to the iOS device.
 
 ### 4. Run the app and checkout the streaming output token results
 
 **Note**: The current app only sets up with a simple initial prompt question, you can adjust/try your own or refine the UI based on requirements.
 
-***Notice:*** The current Xcode project runs on iOS 16.6, feel free to adjust latest iOS/build for lates iOS versions accordingly.
+***Notice:*** The current Xcode project runs on iOS 16.6; feel free to adjust the iOS version and build settings for later iOS versions accordingly.
+
+![LocalLLM chat screen running in the iPhone 16 simulator](<Simulator Screenshot - iPhone 16.png>)
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift
@@ -14,4 +14,10 @@ import Foundation
             self.decodedTokens.append(token)
         }
     }
+
+    /// Empties `decodedTokens`. The removal is queued on the main queue rather than
+    /// performed inline.
+    @objc func clearTokens() {
+        DispatchQueue.main.async { self.decodedTokens.removeAll() }
+    }
 }
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png