Get Youtube URL and Finetuning for URL

Method 1:

Using playlist

Install pytube with pip

pip install pytube

Upgrade pytube to the latest version.

Get the video URLs from a playlist URL (e.g. a playlist of a YouTube channel) and append them to plurls.txt.

The input must be a playlist URL.

from pytube import Playlist
import os

def get_playlist(playlists):
    """Collect the video links of every playlist in ``playlists``.

    Args:
        playlists: iterable of playlist URLs; each one is handed to
            pytube's ``Playlist``.

    Returns:
        One flat list holding every item yielded by iterating each
        ``Playlist``, in the order the playlists were given.
    """
    return [
        video_url
        for playlist_url in playlists
        for video_url in Playlist(playlist_url)
    ]

# Copy and paste the URLs of the YouTube playlists here.
# Update this list by hand before each run: plurls.txt is opened in append
# mode, so running again with the same playlists writes the same URLs again.
# Each entry must be a plain string; wrapping it in braces would create a
# set, which cannot be parsed as a URL.
playlists = [
    "https://www.youtube.com/playlist?list=PL7n_fSOganlHSdr0567mOUrNFxrPRmeL5",
    "https://www.youtube.com/playlist?list=PL7n_fSOganlHTl1eiMyzscQmm403DZD7e",
]
pl_urls = get_playlist(playlists)
print(pl_urls)

# Append the collected URLs to plurls.txt, one per line.
with open("plurls.txt", 'a') as f:
    for url in pl_urls:
        f.write(url+'\n')

Using a keyword

Import ‘Search’ (capital S), not ‘search’.

Any keyword can be searched.

from pytube import Search

# Keyword to search for.
query = 'Scala'

# Run the search and keep at most the first 24 results.
# Each result exposes .title and .watch_url.
search_results = Search(query)
results = search_results.results[:24]

# Append the watch URL of every kept result to plurls.txt, one per line.
with open("plurls.txt", 'a') as f:
    f.writelines(entry.watch_url + '\n' for entry in results)
    # Reached only when fetching and writing raised no error.
    print("success")

Full URL → partial URL (and add into long_short_urls.txt).

Generate the partial (short) URL and save each full URL together with its partial URL in the format below:

{full URL}, one space, {partial URL}

# Create long_short_urls.txt, or append to it, with one
# "<full URL> <video id>" pair per line, separated by a single space:
# https://www.youtube.com/watch?v=BOLtvLpUmBE BOLtvLpUmBE
# https://www.youtube.com/watch?v=e_rSD6Y9Qo0 e_rSD6Y9Qo0
watch_prefix = "https://www.youtube.com/watch?v="
with open('plurls.txt') as f:
    long_urls = f.readlines()
with open("long_short_urls.txt", 'a') as f:
    for long_url in long_urls:
        long_url = long_url.strip()
        if not long_url:
            # A blank line would be written as a lone space, and a line
            # like that cannot be split back into two fields.
            continue
        if not long_url.startswith(watch_prefix):
            # Slicing by the prefix length only yields the video id for
            # URLs of exactly this form; skip anything else rather than
            # store a wrong id.
            print("skipped (not a watch URL): " + long_url)
            continue
        short_url = long_url[len(watch_prefix):]
        f.write(long_url + " " + short_url + '\n')

Method 2:

Get and configure the client_secret.json

Search for “Google Cloud console” on Google and open it.
Click “APIs & Services”, and then click the API “Library”.
We need the “YouTube Data API v3” to get data such as the URLs of YouTube videos.
Create a project and an OAuth 2.0 credential for “Desktop Applications”.
Important: add your Google account (for example, xxxxx@gmail.com) to the “Test users”. Otherwise you can’t get the authentication code.
After finishing the steps above, download the JSON file using the download icon under “Actions”.
Place the JSON file in the same directory as the Python script and rename it to “client_secret.json”.

Change the channel_id and run the code

(To find the channel_id, copy and paste the channel URL into) https://commentpicker.com/youtube-channel-id.php or look it up at https://www.youtube.com/account_advanced

import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

# Set up the YouTube Data API client
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
client_secrets_file = "client_secret.json"

flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)
# run_console() (the copy/paste "out-of-band" flow) was removed in
# google-auth-oauthlib 1.0. run_local_server() opens the consent page in a
# browser and receives the redirect on a local port; port=0 lets the OS
# pick a free one.
credentials = flow.run_local_server(port=0)
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, credentials=credentials)

# Define the channel ID for the channel you want to get videos from
channel_id = "UC2RLsONIDUONQ8R_2xdZlRw"

# Call the API to get the list of videos from the channel.
# Only one page is requested, so at most maxResults (50) videos come back,
# ordered by date.
request = youtube.search().list(
    part="id",
    channelId=channel_id,
    maxResults=50,
    order="date",
    type="video"
)
response = request.execute()

# Extract the video IDs from the response; a response without "items"
# yields an empty list instead of a KeyError.
video_ids = [item["id"]["videoId"] for item in response.get("items", [])]

# Construct the URLs for the videos
video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]

# Print the URLs
print(video_urls)

Long_Short URL

Format Example:

https://www.youtube.com/watch?v={xxxxxx} {xxxxxx}


# Create long_short_urls.txt, or append to it, with one
# "<full URL> <video id>" pair per line, separated by a single space:
# https://www.youtube.com/watch?v=BOLtvLpUmBE BOLtvLpUmBE
# https://www.youtube.com/watch?v=e_rSD6Y9Qo0 e_rSD6Y9Qo0
watch_prefix = "https://www.youtube.com/watch?v="
with open('plurls.txt') as f:
    long_urls = f.readlines()
with open("long_short_urls.txt", 'a') as f:
    for long_url in long_urls:
        long_url = long_url.strip()
        if not long_url:
            # A blank line would be written as a lone space, and a line
            # like that cannot be split back into two fields.
            continue
        if not long_url.startswith(watch_prefix):
            # Slicing by the prefix length only yields the video id for
            # URLs of exactly this form; skip anything else rather than
            # store a wrong id.
            print("skipped (not a watch URL): " + long_url)
            continue
        short_url = long_url[len(watch_prefix):]
        f.write(long_url + " " + short_url + '\n')

merge url, text → jsonl

# Upper bound on the number of training samples written per run.
max_samples = 300


def _read_templates(path):
    """Return the non-blank lines of ``path`` without their trailing newline."""
    with open(path, "r") as f:
        # rstrip("\n") rather than line[:-1]: a last line with no trailing
        # newline must not lose its final character.
        return [line.rstrip("\n") for line in f if line.strip()]


# xxxxx.txt holds the prompt templates; the text "xxxxx" inside each one is
# replaced by the partial URL.
prompt_templates = _read_templates("xxxxx.txt")
# yyyyy.txt holds the completion templates; the text "yyyyy" inside each one
# is replaced by the full URL.
completion_templates = _read_templates("yyyyy.txt")
if not prompt_templates or not completion_templates:
    raise ValueError(
        "xxxxx.txt and yyyyy.txt must each contain at least one template")

# Read the "<full URL> <partial URL>" pairs.
long_urls = []
short_urls = []
with open("long_short_urls.txt", 'r') as f:
    for line in f:
        fields = line.split()
        if len(fields) != 2:
            # Skip blank or malformed lines instead of failing on unpacking.
            continue
        long_urls.append(fields[0])
        short_urls.append(fields[1])

# Never index past the pairs that are actually available.
sample_count = min(max_samples, len(long_urls))

# Fill in the templates, cycling through them when there are fewer
# templates than samples.
xxxxx = [
    prompt_templates[i % len(prompt_templates)].replace("xxxxx", short_urls[i])
    for i in range(sample_count)
]
yyyyy = [
    completion_templates[i % len(completion_templates)].replace("yyyyy", long_urls[i])
    for i in range(sample_count)
]

# format
# {"prompt": "x", "completion": "y"}
# NOTE(review): the slices below drop the last character of each prompt and
# the first and last characters of each completion — presumably every
# template line is wrapped in double quotes; confirm against the template
# files.
with open('my_file.txt','a') as f:
    for prompt, completion in zip(xxxxx, yyyyy):
        record = '{"prompt":' + prompt[:-1] +' ->","completion":" ' + completion[1:-1]+'"}'
        f.write(record + '\n')
        print(record)

!openai tools fine_tunes.prepare_data -f /Users/zhuborui/Desktop/finetuning_url/{local_file}.jsonl

NOTE: if the command fails, prefix it with “!”. → reformat

Example:

{"prompt":"Can you provide me with the full YouTube URL? I have the partial URL as BOLtvLpUmBE. ->","completion":" Absolutely, feel free to access the link at https:\/\/www.youtube.com\/watch?v=BOLtvLpUmBE.\n"}

“END” or “\n” can be used as the suffix.

upload training data and get file_id

import openai
import os

# set the OpenAI API key
openai.api_key = '<your API Key>'

# Upload the prepared training data. The file is opened in a with-block so
# its handle is closed as soon as the upload call returns.
with open('/Users/zhuborui/Desktop/finetuning_url/my_file_prepared.jsonl', "rb") as training_file:
    upload_response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
# The id of the uploaded file; the fine-tune request uses it as training_file.
file_id = upload_response.id
print(file_id)

start fine-tuning

import requests
import json

# Endpoint the fine-tune request is posted to.
url = "https://api.openai.com/v1/fine-tunes"
# Use the uploaded file (file_id) as training data for the "ada" model.
payload = {
    "training_file": file_id,
    "model": "ada",
}
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer <your api key>",
}
r = requests.post(url, json=payload, headers=headers)
# Parsed JSON body of the response; shown when this is the last expression
# of a notebook cell.
r.json()

check the status

import requests
import json

# A GET on the fine-tunes endpoint; only the Authorization header is sent.
url = "https://api.openai.com/v1/fine-tunes"
headers = {
    "Authorization": "Bearer <your api key>",
}
r = requests.get(url, headers=headers)
# Parsed JSON body of the response; shown when this is the last expression
# of a notebook cell.
r.json()