twitter-archive-parser
twitter-archive-parser copied to clipboard
Bug: some videos use different URLs
This affects the download_better_images.py script, causing it to report failures, e.g.:
176/350: Fail. Media couldn't be retrieved: https://video.twimg.com/tweet_video/Q1P1WLseqOz6yoDf.mp4 Filename: media\1332621192075890689-Q1P1WLseqOz6yoDf.mp4
Twitter seems to handle videos differently depending on their size. The correct URL for each available bitrate is present in the tweet's JSON, under extended_entities.media[].video_info.variants.
Proposal:
- parser.py pulls out the highest-quality video URL (and image) and puts it in media/sources.txt, where each row contains: <filename> <URL>
- If the user runs download_better_images.py then it reads that file and tries to upgrade each file.
Example of JSON for a tweet with a large video:
"tweet" : {
"retweeted" : false,
"source" : "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>",
"entities" : {
"user_mentions" : [
{
"name" : "David Chaney",
"screen_name" : "chaoticmass",
"indices" : [
"0",
"12"
],
"id_str" : "34676511",
"id" : "34676511"
},
{
"name" : "Seth Lombardy",
"screen_name" : "sethllombardy",
"indices" : [
"13",
"27"
],
"id_str" : "1340012104837840896",
"id" : "1340012104837840896"
}
],
"urls" : [
{
"url" : "https://t.co/XCoUjiVotX",
"expanded_url" : "https://timhutton.github.io/mobius-transforms/dfs_recipes.html",
"display_url" : "timhutton.github.io/mobius-transfo…",
"indices" : [
"82",
"105"
]
}
],
"symbols" : [ ],
"media" : [
{
"expanded_url" : "https://twitter.com/_tim_hutton_/status/1493679723985178625/video/1",
"indices" : [
"106",
"129"
],
"url" : "https://t.co/SG3FtFVyns",
"media_url" : "http://pbs.twimg.com/ext_tw_video_thumb/1493678749400944655/pu/img/Yp_ThQ_CLafFBcLS.jpg",
"id_str" : "1493678749400944655",
"id" : "1493678749400944655",
"media_url_https" : "https://pbs.twimg.com/ext_tw_video_thumb/1493678749400944655/pu/img/Yp_ThQ_CLafFBcLS.jpg",
"sizes" : {
"thumb" : {
"w" : "150",
"h" : "150",
"resize" : "crop"
},
"medium" : {
"w" : "1200",
"h" : "587",
"resize" : "fit"
},
"small" : {
"w" : "680",
"h" : "333",
"resize" : "fit"
},
"large" : {
"w" : "1880",
"h" : "920",
"resize" : "fit"
}
},
"type" : "photo",
"display_url" : "pic.twitter.com/SG3FtFVyns"
}
],
"hashtags" : [ ]
},
"display_text_range" : [
"0",
"129"
],
"favorite_count" : "2",
"in_reply_to_status_id_str" : "1491490690978791430",
"id_str" : "1493679723985178625",
"in_reply_to_user_id" : "2246902119",
"truncated" : false,
"retweet_count" : "1",
"id" : "1493679723985178625",
"in_reply_to_status_id" : "1491490690978791430",
"possibly_sensitive" : false,
"created_at" : "Tue Feb 15 20:12:52 +0000 2022",
"favorited" : false,
"full_text" : "@chaoticmass @sethllombardy Sliders are now live. Please let me know of any bugs.\nhttps://t.co/XCoUjiVotX https://t.co/SG3FtFVyns",
"lang" : "en",
"in_reply_to_screen_name" : "_tim_hutton_",
"in_reply_to_user_id_str" : "2246902119",
"extended_entities" : {
"media" : [
{
"expanded_url" : "https://twitter.com/_tim_hutton_/status/1493679723985178625/video/1",
"indices" : [
"106",
"129"
],
"url" : "https://t.co/SG3FtFVyns",
"media_url" : "http://pbs.twimg.com/ext_tw_video_thumb/1493678749400944655/pu/img/Yp_ThQ_CLafFBcLS.jpg",
"id_str" : "1493678749400944655",
"video_info" : {
"aspect_ratio" : [
"47",
"23"
],
"duration_millis" : "30966",
"variants" : [
{
"bitrate" : "256000",
"content_type" : "video/mp4",
"url" : "https://video.twimg.com/ext_tw_video/1493678749400944655/pu/vid/550x270/2HJfZEO-rkLt02vO.mp4?tag=12"
},
{
"bitrate" : "832000",
"content_type" : "video/mp4",
"url" : "https://video.twimg.com/ext_tw_video/1493678749400944655/pu/vid/734x360/8ozH3FpjGnHvZYnU.mp4?tag=12"
},
{
"content_type" : "application/x-mpegURL",
"url" : "https://video.twimg.com/ext_tw_video/1493678749400944655/pu/pl/KuttNZoJYYzJIHf4.m3u8?tag=12&container=fmp4"
},
{
"bitrate" : "2176000",
"content_type" : "video/mp4",
"url" : "https://video.twimg.com/ext_tw_video/1493678749400944655/pu/vid/1470x720/iuSYsk2rbOuR91tU.mp4?tag=12"
}
]
},
"additional_media_info" : {
"monetizable" : false
},
"id" : "1493678749400944655",
"media_url_https" : "https://pbs.twimg.com/ext_tw_video_thumb/1493678749400944655/pu/img/Yp_ThQ_CLafFBcLS.jpg",
"sizes" : {
"thumb" : {
"w" : "150",
"h" : "150",
"resize" : "crop"
},
"medium" : {
"w" : "1200",
"h" : "587",
"resize" : "fit"
},
"small" : {
"w" : "680",
"h" : "333",
"resize" : "fit"
},
"large" : {
"w" : "1880",
"h" : "920",
"resize" : "fit"
}
},
"type" : "video",
"display_url" : "pic.twitter.com/SG3FtFVyns"
}
]
}
}