SubtitlesParser icon indicating copy to clipboard operation
SubtitlesParser copied to clipboard

Stream is not in a valid Youtube XML format

Open — nazar322 opened this issue 2 years ago • 1 comment

YtXmlFormatParser.Parse causes System.ArgumentException: 'Stream is not in a valid Youtube XML format'

The code is as follows:

// Receives the parsed subtitle entries; assigned inside the using block below.
List<SubtitlesParser.Classes.SubtitleItem> subtitleItems;
var ytSubtitlesParser = new SubtitlesParser.Classes.Parsers.YtXmlFormatParser();

// Encode the caption text (`subtitles`, declared outside this snippet) as UTF-8 and wrap the bytes in an in-memory stream.
using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(subtitles)))
{
     // Parse the stream as UTF-8. Per this report, parsing the attached captions fails with
     // System.ArgumentException: 'Stream is not in a valid Youtube XML format'.
     subtitleItems = ytSubtitlesParser.ParseStream(stream, Encoding.UTF8);
}

YouTube captions attached: yt-video-oPnDOxMXlUc.zip

nazar322 avatar Apr 26 '22 12:04 nazar322

/// <summary>
/// Matches one caption paragraph of the form <c>&lt;p t="START_MS" d="DURATION_MS"&gt;text&lt;/p&gt;</c>.
/// Built once and reused so the pattern is not re-parsed on every conversion.
/// </summary>
private static readonly Regex YouTubeParagraphExpression = new Regex(
            "<p\\st=\"(?<timestamp>\\d+)\"\\sd=\"(?<duration>\\d+)\">(?<text>.*?)</p>",
            RegexOptions.Singleline);

/// <summary>
/// Converts YouTube caption XML made of <c>&lt;p t="..." d="..."&gt;</c> paragraphs into SRT text.
/// </summary>
/// <param name="ytSubtitles">The caption XML; <c>t</c> and <c>d</c> are read as milliseconds.</param>
/// <returns>
/// One SRT entry per matched paragraph: a 1-based sequence number, a
/// <c>HH:MM:SS,mmm --&gt; HH:MM:SS,mmm</c> line, the caption text and a blank line.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="ytSubtitles"/> is null.</exception>
/// <exception cref="FormatException">No caption paragraph could be found in the input.</exception>
private string ConvertYouTubeXmlToSrtFormat(string ytSubtitles)
        {
            if (ytSubtitles == null) throw new ArgumentNullException(nameof(ytSubtitles));

            var matchedSubtitles = YouTubeParagraphExpression.Matches(ytSubtitles);

            // A specific exception type instead of the bare Exception; callers catching Exception still work.
            if (matchedSubtitles.Count == 0) throw new FormatException("Failed to extract subtitles");

            var srtWriter = new StringBuilder(matchedSubtitles.Count * 50);
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            for (var i = 0; i < matchedSubtitles.Count; i++)
            {
                var matchedSubtitle = matchedSubtitles[i];

                srtWriter.AppendLine((i + 1).ToString(invariant)); // sequence number

                // Parse as long so offsets above int.MaxValue milliseconds do not overflow.
                var startMs = long.Parse(matchedSubtitle.Groups["timestamp"].Value, System.Globalization.NumberStyles.None, invariant);
                var durationMs = long.Parse(matchedSubtitle.Groups["duration"].Value, System.Globalization.NumberStyles.None, invariant);
                var timestamp = TimeSpan.FromMilliseconds(startMs);
                var duration = TimeSpan.FromMilliseconds(durationMs);
                srtWriter.AppendLine(FormatSrtTimestamp(timestamp) + " --> " + FormatSrtTimestamp(timestamp + duration)); // timestamps

                // XML text is entity-escaped (&amp;, &#39;, ...); SRT is plain text, so decode it.
                srtWriter.AppendLine(System.Net.WebUtility.HtmlDecode(matchedSubtitle.Groups["text"].Value)); // text

                srtWriter.AppendLine();
            }

            return srtWriter.ToString();
        }

/// <summary>
/// Formats a time offset as an SRT timestamp (<c>HH:MM:SS,mmm</c>). The hour field is the
/// total hour count, because the TimeSpan "hh" specifier wraps at 24 hours.
/// </summary>
private static string FormatSrtTimestamp(TimeSpan value)
        {
            return string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "{0:00}:{1:00}:{2:00},{3:000}",
                (long)value.TotalHours,
                value.Minutes,
                value.Seconds,
                value.Milliseconds);
        }

nazar322 avatar Apr 26 '22 17:04 nazar322