Use yt-dlp for interaction with YouTube

Handle existing videos by ID rather than date. Some tweaks.
2022-05-11 13:07:50 +02:00 · 2022-05-11 13:07:50 +02:00 · d9f0227b1b
parent 05ee31f3ba
commit d9f0227b1b
6 changed files with 99 additions and 135 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
+venv/*
+*.pyc
+__pycache__/
+config.toml
+*.tmp
+video_list.txt
--- a/README.md
+++ b/README.md
@ -2,13 +2,13 @@

 YouTube2PeerTube is a bot written in Python3 that mirrors YouTube channels to PeerTube channels as videos are released in a YouTube channel.

-It checks YouTube channels periodically, when new videos are found, it mirrors them with metadata to PeerTube corresponding peertube channels.
+It checks YouTube channels periodically. When new videos are found, it mirrors them with metadata to corresponding PeerTube channels.

 This tool supports multiple channels, and supports mirroring each YouTube channel to a user defined PeerTube channel and instance that can be different for each YouTube channel being mirrored.

-This tool does not use YouTube APIs. Instead, it subscribes to channels via RSS. This is a primary feature, this tool will always avoid the YouTube API, and no features will be implemented that require the YouTube API.
+This tool uses yt-dlp to retrieve the required information from the YouTube channel you wish to mirror. This is a central feature as it will make sure that *all* available videos are mirrored.

-If you need to archive a YouTube channel with lots of existing videos, this tool is not for you. This tool starts mirroring channels from the time they are added to the config and will not mirror all historical videos that exist in a YouTube channel. A tool that provides this functionality is available https://github.com/Chocobozzz/PeerTube/blob/develop/support/doc/tools.md#peertube-import-videosjs
+If you need to archive a YouTube channel with lots of existing videos, this tool is for you. This tool mirrors all historical videos that exist in a YouTube channel. Another tool that provides this functionality is available at https://github.com/Chocobozzz/PeerTube/blob/develop/support/doc/tools.md#peertube-import-videosjs

 ## Installation

@ -26,15 +26,15 @@ This will create a virtual environment for the tool and install all dependencies

 This tool depends on:

- pafy https://github.com/mps-youtube/pafy for downloading of YouTube content.
+- yt-dlp for downloading of YouTube content

- feedparser for parsing of RSS data
+- requests for other HTTP requests

 - TOML for the configuration file

 - MultipartEncoder from requests_toolbelt

- urllib.request, requests, mimetypes, time, json and os from the Python standard library
+- urllib.request, datetime, mimetypes, time, json and os from the Python standard library

 It also contains heavily modified components from prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia for uploading videos and metadata to PeerTube.

@ -84,22 +84,4 @@ Please open issues if you find any problems or if you have a feature request. Pu

 ## Thanks!

-Thanks to the mps-youtube project https://github.com/mps-youtube for pafy, and thanks to LecygneNoir https://git.lecygnenoir.info/LecygneNoir for the prismedia project. Thank you Tom for TOML and as always, Guido and the Python team.
-
-If you find this tool useful and would like to donate, the following donation options are available:
-
-XMR: 4AeufJrhpQn7LGW5dZ9tH4FFAtfmRwEDvhYrH5GQDbNxQ9VyWKmdycb5naWcvRTqbm3fkyqrDi23x453stDKzu5YVgPfcbj
-
-BTC legacy: 141HaN7bq781BaB2PRP8mkUndebZXjxiFU
-
-BTC segwit compatible: bc1qx2fa50av3j9hrjnszsnpflmtxqnz08936mq4xx
-
-BCH: qzr9gk7tv274x9u9sft243m729zrjnq0cvpzlelapt
-
-LTC: ltc1qa8re5eh2dklzfhg2x03tswsr5wae68qfxjzacd
-
-ETH: 0x18304c5ed37dacefc920b291f39b06545b5fc258
-
-ETC: 0xee3947eec103346ed42302221d99027a59bfa061
-
-Buy me a cup of coffee!
+Thanks to Mister Monster for the original YouTube2PeerTube project (https://github.com/mister-monster/YouTube2PeerTube), and thanks to LecygneNoir https://git.lecygnenoir.info/LecygneNoir for the prismedia project. Thank you Tom for TOML and as always, Guido and the Python team.
--- a/channels_timestamps.csv
+++ b/channels_timestamps.csv
@ -1 +0,0 @@
-
--- a/requirements.txt
+++ b/requirements.txt
@ -1,10 +1,8 @@
 certifi==2019.11.28
 chardet==3.0.4
-feedparser==5.2.1
 idna==2.8
-pafy==0.5.5
 requests==2.22.0
 requests-toolbelt==0.9.1
 toml==0.10.0
-urllib3==1.26.5
 youtube-dl==2019.11.28
+yt-dlp
--- a/utils.py
+++ b/utils.py
@ -1,5 +1,7 @@
 import toml

+from datetime import datetime
+
 def read_conf(conf_file):
    conf_file = open(conf_file)
    conf = conf_file.read()
@ -7,13 +9,9 @@ def read_conf(conf_file):
    conf_file.close()
    return conf

-def convert_timestamp(timestamp):
-    timestamp = timestamp.split('T')
-    date = timestamp[0].split('-')
-    time = timestamp[1].split('+')
-    time = time[0].split(':')
-    timestamp = int(date[0] + date[1] + date[2] + time[0] + time[1] + time[2])
-    return timestamp
+def get_originally_uploaded_pt(date):
+    dt = datetime.strptime(date, "%Y%m%d")
+    return dt.isoformat()

 def set_pt_lang(yt_lang, conf_lang):
    YOUTUBE_LANGUAGE = {
@ -46,6 +44,7 @@ def set_pt_lang(yt_lang, conf_lang):
        "russian": "ru",
        "spanish": "es"
    }
+
    # if youtube provides a language value
    if yt_lang != None:
        # if the language value is a value and not a key
@ -70,7 +69,6 @@ def set_pt_lang(yt_lang, conf_lang):
    return lang

 def set_pt_category(category_str):
-    print(category_str)
    PEERTUBE_CATEGORY = {
        "music": 1,
        "films": 2,
--- a/youtube2peertube.py
+++ b/youtube2peertube.py
@ -2,101 +2,74 @@

 import sys
 import getopt
-import pafy
-import feedparser as fp
-from urllib.request import urlretrieve
+from urllib.request import urlopen
+from urllib.error import HTTPError
 import requests
 import json
 from time import sleep
 from os import mkdir, path
 from shutil import rmtree
-import mimetypes
 from requests_toolbelt.multipart.encoder import MultipartEncoder
 import utils
+import yt_dlp
+from pathlib import Path
+
+VIDEO_LIST_FILENAME = "video_list.txt"

 def get_video_data(channel_id):
-    yt_rss_url = "https://www.youtube.com/feeds/videos.xml?channel_id=" + channel_id
-    feed = fp.parse(yt_rss_url)
-    channel_lang = feed["feed"]["title_detail"]["language"]
-    print(feed["feed"])
-    entries = feed["entries"]
-    channels_timestamps = "channels_timestamps.csv"
-    # clear any existing queue before start
+    with yt_dlp.YoutubeDL() as ydl:
+        channel = ydl.extract_info(
+            "https://youtube.com/channel/" + channel_id,
+            download=False)
+
+    entries = channel["entries"]
+
    queue = []
-    # read contents of channels_timestamps.csv, create list object of contents
-    ct = open(channels_timestamps, "r")
-    ctr = ct.read().split("\n")
-    ct.close()
-    ctr_line = []
-    channel_found = False
-    # check if channel ID is found in channels_timestamps.csv
-    for line in ctr:
-        line_list = line.split(',')
-        if channel_id == line_list[0]:
-            channel_found = True
-            ctr_line = line
-            break
-    if not channel_found:
-        print("new channel added to config: " + channel_id)
-    print(channel_id)
-    # iterate through video entries for channel, parse data into objects for use
+
+    try:
+        with open(VIDEO_LIST_FILENAME, "r") as video_list_file:
+            video_list = video_list_file.read().split("\n")
+    except FileNotFoundError:
+        video_list = []
+
    for pos, i in enumerate(reversed(entries)):
-        published = i["published"]
-        updated = i["updated"]
-        if not channel_found:
-            # add the video to the queue
+        published = i["upload_date"]
+        if not i["id"] in video_list:
            queue.append(i)
-            ctr_line = str(channel_id + "," + published + "," + updated + '\n')
-            # add the new line to ctr for adding to channels_timestamps later
-            ctr.append(ctr_line)
-            channel_found = True
-        # if the channel exists in channels_timestamps, update "published" time in the channel line
-        else:
-            published_int = utils.convert_timestamp(published)
-            ctr_line_list = ctr_line.split(",")
-            line_published_int = utils.convert_timestamp(ctr_line_list[1])
-            if published_int > line_published_int:
-                # update the timestamp in the line for the channel in channels_timestamps,
-                ctr.remove(ctr_line)
-                ctr_line = str(channel_id + "," + published + "," + updated + '\n')
-                ctr.append(ctr_line)
-                # and add current videos to queue.
-                queue.append(i)
-        print(published)
-    # write the new channels and timestamps line to channels_timestamps.csv
-    ct = open(channels_timestamps, "w")
-    for line in ctr:
-        if line != '':
-            ct.write(line + "\n")
-    ct.close()
-    return queue, channel_lang
+            video_list.append(i["id"])
+
+    return queue
+
+def write_completion(video_id):
+    with open(VIDEO_LIST_FILENAME, "a") as video_list_file:
+        video_list_file.write("\n" + video_id)

 def download_yt_video(queue_item, dl_dir, channel_conf):
-    url = queue_item["link"]
+    url = queue_item["original_url"]
    dl_dir = dl_dir + channel_conf["name"]
    try:
-        video = pafy.new(url)
-        streams = video.streams
-        #for s in streams:
-            #print(s.resolution, s.extension, s.get_filesize, s.url)
-        best = video.getbest(preftype=channel_conf["preferred_extension"])
        filepath = dl_dir + "/"+ queue_item["yt_videoid"] + "." + channel_conf["preferred_extension"]
-        #TODO: implement resolution logic from config, currently downloading best resolution
-        best.download(filepath=filepath, quiet=False)
+        ydl_opts = {
+            "format": "best",
+            "output": filepath,
+            "noplaylist": true,
+            "merge-output-format": channel_conf["preferred_extension"],
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])

    except:
        pass
-        # TODO: check YT alternate URL for video availability
        # TODO: print and log exceptions

 def save_metadata(queue_item, dl_dir, channel_conf):
    dl_dir = dl_dir + channel_conf["name"]
-    link = queue_item["link"]
+    link = queue_item["original_url"]
    title = queue_item["title"]
-    description = queue_item["summary"]
-    author = queue_item["author"]
-    published = queue_item["published"]
-    metadata_file = dl_dir + "/" + queue_item["yt_videoid"] + ".txt"
+    description = queue_item["description"]
+    author = queue_item["uploader"]
+    published = queue_item["upload_date"]
+    metadata_file = dl_dir + "/" + queue_item["id"] + ".txt"
    metadata = open(metadata_file, "w+")
    # save relevant metadata as semicolon separated easy to read values to text file
    metadata.write('title: "' + title + '";\n\nlink: "' + link + '";\n\nauthor: "' + author + '";\n\npublished: "' +
@ -106,12 +79,24 @@ def save_metadata(queue_item, dl_dir, channel_conf):
    metadata.close()

 def save_thumbnail(queue_item, dl_dir, channel_conf):
-    dl_dir = dl_dir + channel_conf["name"]
-    thumb = str(queue_item["media_thumbnail"][0]["url"])
-    extension = thumb.split(".")[-1]
-    thumb_file = dl_dir + "/" + queue_item["yt_videoid"] + "." + extension
-    # download the thumbnail
-    urlretrieve(thumb, thumb_file)
+    dl_dir = Path(dl_dir) / channel_conf["name"]
+
+    url = "https://i.ytimg.com/vi_webp/%s/maxresdefault.webp" % queue_item["id"]
+    extension = "webp"
+
+    outfile = dl_dir / (queue_item["id"] + "." + extension) 
+
+    try:
+        data = urlopen(url).read()
+    except HTTPError as e:
+        if e.code == 404:
+            data = e.read()
+        else:
+            raise
+
+    with open(outfile, "wb") as out:
+        out.write(data)
+
    return extension

 def get_pt_auth(channel_conf):
@ -144,9 +129,8 @@ def get_pt_channel_id(channel_conf):
    return channel_id

 def get_file(file_path):
-    mimetypes.init()
    return (path.basename(file_path), open(path.abspath(file_path), 'rb'),
-            mimetypes.types_map[path.splitext(file_path)[1]])
+            "image/webp")


 def handle_peertube_result(request_result):
@ -159,10 +143,10 @@ def handle_peertube_result(request_result):
 def upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension):
    # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia
    pt_api = channel_conf["peertube_instance"] + "/api/v1"
-    video_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + \
+    video_file = dl_dir + channel_conf["name"] + "/" + queue_item["id"] + "." + \
                 channel_conf["preferred_extension"]
-    thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension
-    description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"]
+    thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["id"] + "." + thumb_extension
+    description = channel_conf["description_prefix"] + "\n\n" + queue_item["description"] + "\n\n" + channel_conf["description_suffix"]
    channel_id = str(get_pt_channel_id(channel_conf))
    category = utils.set_pt_category(channel_conf["pt_channel_category"])
    # We need to transform fields into tuple to deal with tags as
@ -176,7 +160,7 @@ def upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension
            ("description", description),
            ("nsfw", channel_conf["nsfw"]),
            ("channelId", channel_id),
-            ("originallyPublishedAt", queue_item["published"]),
+            ("originallyPublishedAt", utils.get_originally_uploaded_pt(queue_item["upload_date"])),
            ("category", category),
            ("language", channel_conf["default_lang"]),
            ("privacy", str(channel_conf["pt_privacy"])),
@ -201,16 +185,14 @@ def upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension
    
    return handle_peertube_result(requests.post(pt_api + "/videos/upload", data=multipart_data, headers=headers))

-def pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang):
+def pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension):
    # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia
    pt_api = channel_conf["peertube_instance"] + "/api/v1"
-    yt_video_url = queue_item["link"]
-    # TODO: use the alternate link if video not found error occurs
-    alternate_link = queue_item["links"][0]["href"]
-    thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension
-    description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"]
+    yt_video_url = queue_item["original_url"]
+    thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["id"] + "." + thumb_extension
+    description = channel_conf["description_prefix"] + "\n\n" + queue_item["description"] + "\n\n" + channel_conf["description_suffix"]
    channel_id = str(get_pt_channel_id(channel_conf))
-    language = utils.set_pt_lang(yt_lang, channel_conf["default_lang"])
+    language = utils.set_pt_lang(None, channel_conf["default_lang"])
    category = utils.set_pt_category(channel_conf["pt_channel_category"])
    # We need to transform fields into tuple to deal with tags as
    # MultipartEncoder does not support list refer
@ -222,7 +204,7 @@ def pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extensi
        ("description", description),
        ("nsfw", channel_conf["nsfw"]),
        ("channelId", channel_id),
-        ("originallyPublishedAt", queue_item["published"]),
+        ("originallyPublishedAt", utils.get_originally_uploaded_pt(queue_item["upload_date"])),
        ("category", category),
        ("language", language),
        ("privacy", str(channel_conf["pt_privacy"])),
@ -274,16 +256,14 @@ def run_steps(conf):
        print("\n")
        channel_id = channel[c]["channel_id"]
        channel_conf = channel[str(channel_counter)]
-        video_data = get_video_data(channel_id)
-        queue = video_data[0]
-        yt_lang = video_data[1]
+        queue = get_video_data(channel_id)
        if len(queue) > 0:
            if not path.exists(dl_dir + "/" + channel_conf["name"]):
                mkdir(dl_dir + "/" + channel_conf["name"])
            # download videos, metadata and thumbnails from youtube
            for queue_item in queue:
                if not use_pt_http_import:
-                    print("downloading " + queue_item["yt_videoid"] + " from YouTube...")
+                    print("downloading " + queue_item["id"] + " from YouTube...")
                    download_yt_video(queue_item, dl_dir, channel_conf)
                    print("done.")
                # TODO: download closest to config specified resolution instead of best resolution
@ -297,17 +277,18 @@ def run_steps(conf):
            # upload videos, metadata and thumbnails to peertube
            for queue_item in queue:
                if not use_pt_http_import:
-                    print("uploading " + queue_item["yt_videoid"] + " to Peertube...")
+                    print("uploading " + queue_item["id"] + " to Peertube...")
                    pt_result = upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension)
                
                else:
-                    print("mirroring " + queue_item["link"] + " to Peertube using HTTP import...")
-                    pt_result = pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang)
+                    print("mirroring " + queue_item["original_url"] + " to Peertube using HTTP import...")
+                    pt_result = pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension)

                if pt_result:
+                    write_completion(queue_item["id"])
                    print("done !")
                else:
-                    log_upload_error(queue_item["link"],channel_conf)
+                    log_upload_error(queue_item["original_url"],channel_conf)
            if delete_videos:
                print("deleting videos and/or thumbnails...")
                rmtree(dl_dir + "/" + channel_conf["name"], ignore_errors=True)