microsoft · koreyspace · Sep 17, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 14, 2024
diff --git a/08-building-search-applications/scripts/prepare_transcripts_ai_show.ps1 b/08-building-search-applications/scripts/prepare_transcripts_ai_show.ps1
@@ -25,10 +25,10 @@ python transcript_enrich_lite.py -f $TRANSCRIPT_FOLDER
 
 # Check if master_enriched.json file exists then rename it to include segment minutes
 if (Test-Path "$TRANSCRIPT_FOLDER\output\master_enriched.json") {
-    Move-Item -Path "$TRANSCRIPT_FOLDER\output\master_enriched.json" -Destination "$TRANSCRIPT_FOLDER\output\embedding_index_full_${TRANSCRIPT_BUCKET_MINUTES}m.json"
+    Move-Item -Path "$TRANSCRIPT_FOLDER\output\master_enriched.json" -Destination "$TRANSCRIPT_FOLDER\output\embedding_index_full_${TRANSCRIPT_BUCKET_MINUTES}m.json" -Force
 }
 
 # Check if master_enriched_lite.json file exists then rename it to include segment minutes
 if (Test-Path "$TRANSCRIPT_FOLDER\output\master_enriched_lite.json") {
-    Move-Item -Path "$TRANSCRIPT_FOLDER\output\master_enriched_lite.json" -Destination "$TRANSCRIPT_FOLDER\output\embedding_index_${TRANSCRIPT_BUCKET_MINUTES}m.json"
+    Move-Item -Path "$TRANSCRIPT_FOLDER\output\master_enriched_lite.json" -Destination "$TRANSCRIPT_FOLDER\output\embedding_index_${TRANSCRIPT_BUCKET_MINUTES}m.json" -Force
 }
diff --git a/08-building-search-applications/scripts/transcript_enrich_bucket.py b/08-building-search-applications/scripts/transcript_enrich_bucket.py
@@ -172,20 +172,23 @@ def parse_json_vtt_transcript(vtt, metadata):
 
         # Append the last text segment to the last segment in segments dictionary
         if seg_begin_seconds and text != "":
-            previous_segment_tokens = len(tokenizer.encode(segments[-1]["text"]))
-            current_segment_tokens = len(tokenizer.encode(text))
-
-            if previous_segment_tokens + current_segment_tokens < MAX_TOKENS:
-                segments[-1]["text"] += text
+            if segments:
+                previous_segment_tokens = len(tokenizer.encode(segments[-1]["text"]))
+                current_segment_tokens = len(tokenizer.encode(text))
+
+                if previous_segment_tokens + current_segment_tokens < MAX_TOKENS:
+                    segments[-1]["text"] += text
+                else:
+                    if not first_segment:
+                        # append PERCENTAGE_OVERLAP text to the previous segment
+                        # to smooth context transition
+                        append_text_to_previous_segment(text)
+                    first_segment = False
+                    add_new_segment(metadata, text, seg_begin_seconds)
             else:
-                if not first_segment:
-                    # append PERCENTAGE_OVERLAP text to the previous segment
-                    # to smooth context transition
-                    append_text_to_previous_segment(text)
-                first_segment = False
+                # If the segments list is empty, add the text as a new segment
                 add_new_segment(metadata, text, seg_begin_seconds)
 
-
 def get_transcript(metadata):
     """get the transcript from the .vtt file"""
     global total_files