Rewrite transcript logic to be more generic

The transcript logic in Invidious was written specifically as a workaround for captions, not as a transcript feature in its own right. This commit generalizes the logic so that it can also be used to implement transcripts within Invidious' API and UI. The most notable change is that section headings are now parsed, whereas previously they were skipped and only regular lines were kept.
2025-05-31 14:11:54 +05:30 · 2024-06-11 17:57:33 -07:00
parent eda7444ca4
commit 0224162ad2
2 changed files with 63 additions and 36 deletions
--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos

    if CONFIG.use_innertube_for_captions
      params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
-      initial_data = YoutubeAPI.get_transcript(params)

-      webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+      transcript = Invidious::Videos::Transcript.from_raw(
+        YoutubeAPI.get_transcript(params),
+        caption.language_code,
+        caption.auto_generated
+      )
+
+      webvtt = transcript.to_vtt
    else
      # Timedtext API handling
      url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@@ -1,8 +1,21 @@
 module Invidious::Videos
-  # Namespace for methods primarily relating to Transcripts
-  module Transcript
-    record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+  # A `Transcript` struct encapsulates a sequence of lines that together form the whole transcript for a given YouTube video.
+  # These lines can be categorized into two types: section headings and regular lines representing content from the video.
+  struct Transcript
+    # Types
+    record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+    record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+    alias TranscriptLine = HeadingLine | RegularLine

+    property lines : Array(TranscriptLine)
+    property language_code : String
+    property auto_generated : Bool
+
+    # Initializes a new Transcript struct with the contents and associated metadata describing it
+    def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool)
+    end
+
+    # Generates a protobuf string to fetch the requested transcript from YouTube
    def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
      kind = auto_generated ? "asr" : ""

@@ -30,48 +43,57 @@ module Invidious::Videos
      return params
    end

-    def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
-      # Convert into array of TranscriptLine
-      lines = self.parse(initial_data)
+    # Constructs a `Transcript` struct from the initial YouTube response
+    def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
+      body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
+        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
+        "initialSegments").as_a

+      lines = [] of TranscriptLine
+
+      body.each do |line|
+        if unpacked_line = line["transcriptSectionHeaderRenderer"]?
+          line_type = HeadingLine
+        else
+          unpacked_line = line["transcriptSegmentRenderer"]
+          line_type = RegularLine
+        end
+
+        start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
+        end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
+        text = extract_text(unpacked_line["snippet"]) || ""
+
+        lines << line_type.new(start_ms, end_ms, text)
+      end
+
+      return Transcript.new(
+        lines: lines,
+        language_code: language_code,
+        auto_generated: auto_generated,
+      )
+    end
+
+    # Converts transcript lines to a WebVTT file
+    #
+    # This is used within Invidious to replace subtitles
+    # as a workaround for YouTube's rate-limited timedtext endpoint.
+    def to_vtt
      settings_field = {
        "Kind"     => "captions",
-        "Language" => target_language,
+        "Language" => @language_code,
      }

-      # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
      vtt = WebVTT.build(settings_field) do |vtt|
-        lines.each do |line|
+        @lines.each do |line|
+          # Section headers are excluded from the VTT conversion so that the output
+          # matches the regular captions returned from YouTube as closely as possible
+          next if line.is_a? HeadingLine
+
          vtt.cue(line.start_ms, line.end_ms, line.line)
        end
      end

      return vtt
    end
-
-    private def self.parse(initial_data : Hash(String, JSON::Any))
-      body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
-        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
-        "initialSegments").as_a
-
-      lines = [] of TranscriptLine
-      body.each do |line|
-        # Transcript section headers. They are not apart of the captions and as such we can safely skip them.
-        if line.as_h.has_key?("transcriptSectionHeaderRenderer")
-          next
-        end
-
-        line = line["transcriptSegmentRenderer"]
-
-        start_ms = line["startMs"].as_s.to_i.millisecond
-        end_ms = line["endMs"].as_s.to_i.millisecond
-
-        text = extract_text(line["snippet"]) || ""
-
-        lines << TranscriptLine.new(start_ms, end_ms, text)
-      end
-
-      return lines
-    end
  end
 end