Vraspar/phi 3 ios update #467

Merged 6 commits on Oct 14, 2024
143 changes: 133 additions & 10 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
@@ -3,25 +3,143 @@

import SwiftUI


struct Message: Identifiable {
let id = UUID()
var text: String
let isUser: Bool
}

struct ContentView: View {
@ObservedObject var tokenUpdater = SharedTokenUpdater.shared
@State private var userInput: String = ""
@State private var messages: [Message] = [] // Store chat messages locally
@State private var isGenerating: Bool = false // Track token generation state
@State private var stats: String = "" // token generation stats
@State private var showAlert: Bool = false
@State private var errorMessage: String = ""

private let generator = GenAIGenerator()

var body: some View {
VStack {
// ChatBubbles
ScrollView {
VStack(alignment: .leading) {
ForEach(tokenUpdater.decodedTokens, id: \.self) { token in
Text(token)
.padding(.horizontal, 5)
VStack(alignment: .leading, spacing: 20) {
ForEach(messages) { message in
ChatBubble(text: message.text, isUser: message.isUser)
.padding(.horizontal, 20)
}
if !stats.isEmpty {
Text(stats)
.font(.footnote)
.foregroundColor(.gray)
.padding(.horizontal, 20)
.padding(.top, 5)
.multilineTextAlignment(.center)
}
}
.padding()
.padding(.top, 20)
}
Button("Generate Tokens") {
DispatchQueue.global(qos: .background).async {
// TODO: add user prompt question UI
GenAIGenerator.generate("Who is the current US president?");


// User input
HStack {
TextField("Type your message...", text: $userInput)
.padding()
.background(Color(.systemGray6))
.cornerRadius(20)
.padding(.horizontal)

Button(action: {
// Check for non-empty input
guard !userInput.trimmingCharacters(in: .whitespaces).isEmpty else { return }

messages.append(Message(text: userInput, isUser: true))
messages.append(Message(text: "", isUser: false)) // Placeholder for AI response


// clear previously generated tokens
SharedTokenUpdater.shared.clearTokens()

let prompt = userInput
userInput = ""
isGenerating = true


DispatchQueue.global(qos: .background).async {
generator.generate(prompt)
}
}) {
Image(systemName: "paperplane.fill")
.foregroundColor(.white)
.padding()
.background(isGenerating ? Color.gray : Color.pastelGreen)
.clipShape(Circle())
.padding(.trailing, 10)
}
.disabled(isGenerating)
}
.padding(.bottom, 20)
}
.background(Color(.systemGroupedBackground))
.edgesIgnoringSafeArea(.bottom)
.onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationCompleted"))) { _ in
isGenerating = false // Re-enable the button when token generation is complete
}
.onReceive(SharedTokenUpdater.shared.$decodedTokens) { tokens in
// update model response
if let lastIndex = messages.lastIndex(where: { !$0.isUser }) {
let combinedText = tokens.joined(separator: "")
messages[lastIndex].text = combinedText
}
}
.onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationStats"))) { notification in
if let userInfo = notification.userInfo,
let promptProcRate = userInfo["promptProcRate"] as? Double,
let tokenGenRate = userInfo["tokenGenRate"] as? Double {
stats = String(format: "Token generation rate: %.2f tokens/s. Prompt processing rate: %.2f tokens/s", tokenGenRate, promptProcRate)
}
}
.onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationError"))) { notification in
if let userInfo = notification.userInfo, let error = userInfo["error"] as? String {
errorMessage = error
isGenerating = false
showAlert = true
}
}
.alert(isPresented: $showAlert) {
Alert(
title: Text("Error"),
message: Text(errorMessage),
dismissButton: .default(Text("OK"))
)
}

}
}

struct ChatBubble: View {
var text: String
var isUser: Bool

var body: some View {
HStack {
if isUser {
Spacer()
Text(text)
.padding()
.background(Color.pastelGreen)
.foregroundColor(.white)
.cornerRadius(25)
.padding(.horizontal, 10)
} else {
Text(text)
.padding()
.background(Color(.systemGray5))
.foregroundColor(.black)
.cornerRadius(25)
.padding(.horizontal, 10)
Spacer()
}
}
}
@@ -32,3 +150,8 @@ struct ContentView_Previews: PreviewProvider {
ContentView()
}
}

// Extension for a pastel green color
extension Color {
static let pastelGreen = Color(red: 0.6, green: 0.9, blue: 0.6)
}
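
The view and the Objective-C++ generator communicate only through `SharedTokenUpdater` and three `NotificationCenter` names (`TokenGenerationCompleted`, `TokenGenerationStats`, `TokenGenerationError`). As a rough sketch (not part of this PR), the same contract can be driven from Swift to preview the UI states without loading the model, assuming `addDecodedToken(_:)` has the signature implied by the call sites in this diff:

```
// Sketch only: drives the UI through the same channels GenAIGenerator.mm uses,
// so the chat bubbles, stats footer, and send-button state can be previewed
// without bundling the Phi-3 model.
func simulateGeneration() {
    let center = NotificationCenter.default

    // Stream a few tokens into the placeholder AI message.
    ["Hello", ",", " world", "!"].forEach { SharedTokenUpdater.shared.addDecodedToken($0) }

    // Post stats with the same userInfo keys ContentView reads in onReceive.
    center.post(name: NSNotification.Name("TokenGenerationStats"),
                object: nil,
                userInfo: ["promptProcRate": 42.0, "tokenGenRate": 12.5])

    // Re-enable the send button.
    center.post(name: NSNotification.Name("TokenGenerationCompleted"), object: nil)
}
```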
mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.h
@@ -11,7 +11,7 @@ NS_ASSUME_NONNULL_BEGIN

@interface GenAIGenerator : NSObject

+ (void)generate:(NSString *)input_user_question;
- (void)generate:(NSString *)input_user_question;

@end
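
This hunk changes `generate:` from a class method to an instance method so that a single `GenAIGenerator` instance can cache its `OgaModel` and `OgaTokenizer` across calls (see GenAIGenerator.mm below). A minimal usage sketch from the Swift side, assuming the header is exposed through the project's bridging header as in ContentView.swift:

```
// Sketch: hold one generator so the model and tokenizer are created only once.
let generator = GenAIGenerator()

func ask(_ prompt: String) {
    // Generation blocks, so keep it off the main thread; results stream back
    // through SharedTokenUpdater and NotificationCenter.
    DispatchQueue.global(qos: .background).async {
        generator.generate(prompt)
    }
}
```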

154 changes: 128 additions & 26 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
@@ -1,49 +1,151 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#import "GenAIGenerator.h"
#include <chrono>
#include <vector>
#include "LocalLLM-Swift.h"
#include "ort_genai.h"
#include "ort_genai_c.h"


const size_t kMaxTokens = 200;

@interface GenAIGenerator () {
std::unique_ptr<OgaModel> model;
std::unique_ptr<OgaTokenizer> tokenizer;
}
@end

@implementation GenAIGenerator

+ (void)generate:(nonnull NSString*)input_user_question {
NSString* llmPath = [[NSBundle mainBundle] resourcePath];
const char* modelPath = llmPath.cString;
typedef std::chrono::steady_clock Clock;
typedef std::chrono::time_point<Clock> TimePoint;

- (instancetype)init {
self = [super init];
if (self) {
self->model = nullptr;
self->tokenizer = nullptr;
}
return self;
}

- (void)generate:(nonnull NSString*)input_user_question {
std::vector<long long> tokenTimes; // per-token generation times
tokenTimes.reserve(kMaxTokens);

TimePoint startTime, firstTokenTime, tokenStartTime;

try {
NSLog(@"Starting token generation...");

if (!self->model) {
NSLog(@"Creating model...");
NSString* llmPath = [[NSBundle mainBundle] resourcePath];
const char* modelPath = llmPath.cString;
self->model = OgaModel::Create(modelPath); // throws exception
}

if (!self->tokenizer) {
NSLog(@"Creating tokenizer...");
self->tokenizer = OgaTokenizer::Create(*self->model); // throws exception
}

auto tokenizer_stream = OgaTokenizerStream::Create(*self->tokenizer);

auto model = OgaModel::Create(modelPath);
auto tokenizer = OgaTokenizer::Create(*model);
// Construct the prompt
NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
const char* prompt = [promptString UTF8String];

NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
const char* prompt = [promptString UTF8String];
// Encode the prompt
auto sequences = OgaSequences::Create();
self->tokenizer->Encode(prompt, *sequences);

auto sequences = OgaSequences::Create();
tokenizer->Encode(prompt, *sequences);
size_t promptTokensCount = sequences->SequenceCount(0);

auto params = OgaGeneratorParams::Create(*model);
params->SetSearchOption("max_length", 200);
params->SetInputSequences(*sequences);
NSLog(@"Setting generator parameters...");
auto params = OgaGeneratorParams::Create(*self->model);
params->SetSearchOption("max_length", kMaxTokens);
params->SetInputSequences(*sequences);

// Streaming Output to generate token by token
auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
auto generator = OgaGenerator::Create(*self->model, *params);

auto generator = OgaGenerator::Create(*model, *params);
bool isFirstToken = true;
NSLog(@"Starting token generation loop...");

while (!generator->IsDone()) {
generator->ComputeLogits();
generator->GenerateNextToken();
startTime = Clock::now();
while (!generator->IsDone()) {
tokenStartTime = Clock::now();

const int32_t* seq = generator->GetSequenceData(0);
size_t seq_len = generator->GetSequenceCount(0);
const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
generator->ComputeLogits();
generator->GenerateNextToken();

NSLog(@"Decoded tokens: %s", decode_tokens);
if (isFirstToken) {
firstTokenTime = Clock::now();
isFirstToken = false;
}

// Add decoded token to SharedTokenUpdater
NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
[SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
// Get the sequence data and decode the token
const int32_t* seq = generator->GetSequenceData(0);
size_t seq_len = generator->GetSequenceCount(0);
const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);

if (!decode_tokens) {
throw std::runtime_error("Token decoding failed.");
}

// Measure token generation time excluding logging
TimePoint tokenEndTime = Clock::now();
auto tokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(tokenEndTime - tokenStartTime).count();
tokenTimes.push_back(tokenDuration);
NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
[SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
}

TimePoint endTime = Clock::now();
// Log token times
NSLog(@"Per-token generation times: %@", [self formatTokenTimes:tokenTimes]);

// Calculate metrics
auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
auto firstTokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count();

double promptProcRate = (double)promptTokensCount * 1000.0 / firstTokenDuration;
double tokenGenRate = (double)(tokenTimes.size() - 1) * 1000.0 / (totalDuration - firstTokenDuration);

NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %zu",
totalDuration, firstTokenDuration, tokenTimes.size());
NSLog(@"Prompt tokens: %zu, Prompt Processing Rate: %f tokens/s", promptTokensCount, promptProcRate);
NSLog(@"Generated tokens: %zu, Token Generation Rate: %f tokens/s", tokenTimes.size(), tokenGenRate);

NSDictionary* stats = @{@"tokenGenRate" : @(tokenGenRate), @"promptProcRate" : @(promptProcRate)};
// notify main thread that token generation is complete
dispatch_async(dispatch_get_main_queue(), ^{
[[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
[[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
});

NSLog(@"Token generation completed.");

} catch (const std::exception& e) {
NSString* errorMessage = [NSString stringWithUTF8String:e.what()];
NSLog(@"Error during generation: %@", errorMessage);

// Send error to the UI
NSDictionary* errorInfo = @{@"error" : errorMessage};
dispatch_async(dispatch_get_main_queue(), ^{
[[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationError" object:nil userInfo:errorInfo];
});
}
}

// Utility function to format token times for logging
- (NSString*)formatTokenTimes:(const std::vector<long long>&)tokenTimes {
NSMutableString* formattedTimes = [NSMutableString string];
for (size_t i = 0; i < tokenTimes.size(); i++) {
[formattedTimes appendFormat:@"%lld ms, ", tokenTimes[i]];
}
return [formattedTimes copy];
}

@end
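
The stats above are simple throughput ratios: the prompt processing rate divides the prompt token count by the time to the first generated token, and the token generation rate divides the remaining tokens by the remaining time (the first token's latency is attributed to prompt processing). A small worked sketch with made-up numbers, mirroring the millisecond-to-seconds conversion in the code above:

```
// Illustration only; the timing values are invented.
let promptTokenCount = 16
let firstTokenMs = 800.0        // start -> first generated token
let totalMs = 5_800.0           // start -> end of the generation loop
let generatedTokens = 120       // entries recorded in tokenTimes

let promptProcRate = Double(promptTokenCount) * 1000.0 / firstTokenMs               // 20.0 tokens/s
let tokenGenRate = Double(generatedTokens - 1) * 1000.0 / (totalMs - firstTokenMs)  // 23.8 tokens/s
```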
8 changes: 5 additions & 3 deletions mobile/examples/phi-3/ios/LocalLLM/LocalLLM/README.md
@@ -65,7 +65,7 @@ git clone https://github.com/microsoft/onnxruntime-genai

cd onnxruntime-genai

python3 build.py --parallel --build_dir ./build_iphoneos --ios --ios_sysroot iphoneos --ios_arch arm64 --ios_deployment_target 16.6 --cmake_generator Xcode
python3 build.py --parallel --build_dir ./build_iphoneos --ios --apple_sysroot iphoneos --osx_arch arm64 --apple_deploy_target 16.6 --cmake_generator Xcode

```

@@ -98,12 +98,14 @@ The app uses Objective-C/C++ since using Generative AI with ONNX Runtime C++ API

Download from hf repo: <https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4>

After downloading completes, you need to copy files over to the `Resources` directory in the `Destination` column of `Target-LocalLLM`->`Build Phases`-> `New Copy File Phases` -> `Copy Files`.
After downloading the files, click the `LocalLLM` project in the sidebar and go to `Targets > LocalLLM > Build Phases`. Find the Copy Files section, set the Destination to Resources, and add the downloaded files.

When the app launches, Xcode automatically copies the model files from the Resources folder and installs them directly onto the iOS device.

### 4. Run the app and check out the streaming token output

**Note**: The current app starts from a simple initial prompt question; you can adjust it, try your own, or refine the UI as needed.

***Notice:*** The current Xcode project targets iOS 16.6; feel free to adjust the deployment target and build settings for the latest iOS versions accordingly.

![alt text](<Simulator Screenshot - iPhone 16.png>)
mobile/examples/phi-3/ios/LocalLLM/LocalLLM/SharedTokenUpdater.swift
@@ -14,4 +14,10 @@ import Foundation
self.decodedTokens.append(token)
}
}

@objc func clearTokens() {
DispatchQueue.main.async {
self.decodedTokens.removeAll()
}
}
}
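
Only the new `clearTokens()` method (and part of `addDecodedToken`) is visible in this hunk. For context, the surrounding class is presumably declared roughly as below — a sketch inferred from how it is used in ContentView.swift and GenAIGenerator.mm, not the file's actual contents:

```
import Combine
import Foundation

// Inferred sketch: an ObjC-visible singleton whose published token list drives the SwiftUI view.
@objc class SharedTokenUpdater: NSObject, ObservableObject {
    @objc static let shared = SharedTokenUpdater()
    @Published var decodedTokens: [String] = []

    @objc func addDecodedToken(_ token: String) {
        DispatchQueue.main.async {
            self.decodedTokens.append(token)  // publish on the main thread for SwiftUI
        }
    }
}
```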