Getting YouTube URLs and Fine-Tuning on URLs
Method 1:
- Using playlist
Install pytube with pip:
pip install pytube
If pytube is already installed, upgrade it to the newest version.
Get the video URLs from a playlist URL (of a YouTube channel) and append them to {plurls.txt}.
The input must be a playlist URL.
from pytube import Playlist
import os
# Helper that turns a list of playlists into one flat list of URLs.
def get_playlist(playlists):
    """Collect the entries of every playlist into a single list.

    Args:
        playlists: Iterable of playlist references; each one is passed
            unchanged to ``Playlist(...)``.

    Returns:
        A flat list holding every item produced by iterating each
        ``Playlist``, in the order the playlists were given.
    """
    return [
        video_url
        for playlist_ref in playlists
        for video_url in Playlist(playlist_ref)
    ]
# Copy and paste the URLs of the YouTube playlists here.
# Change this list by hand before each run: plurls.txt is opened in append
# mode, so running again with the same playlists writes the same URLs again.
# Each entry must be a plain string; wrapping a URL in braces would create a
# one-element set instead of a URL string.
playlists = [
    "https://www.youtube.com/playlist?list=PL7n_fSOganlHSdr0567mOUrNFxrPRmeL5",
    "https://www.youtube.com/playlist?list=PL7n_fSOganlHTl1eiMyzscQmm403DZD7e",
]
pl_urls = get_playlist(playlists)
print(pl_urls)
# Append one URL per line to plurls.txt.
with open("plurls.txt", 'a') as f:
    for url in pl_urls:
        f.write(url + '\n')
- Using a keyword
Import ‘Search’ (capitalized), not ‘search’.
Any keyword can be searched.
from pytube import Search
# Keyword to search for.
query = 'Scala'
# Run the search; every result exposes a `watch_url` (and a `title`).
search_results = Search(query)
# Keep at most the first 24 results.
results = search_results.results[:24]
# Append the watch URL of each kept result to plurls.txt, one per line.
with open("plurls.txt", 'a') as f:
    f.writelines(entry.watch_url + '\n' for entry in results)
# Reaching this point means fetching and writing both succeeded.
print("success")
- Full URL → partial URL (and add into long_short_urls.txt).
Generate the partial (short) URL from each full URL and save both in the format below:
{full URL}, one space, {partial URL}
# Create long_short_urls.txt, or append to it, with one
# "<full URL> <video ID>" pair per line, for example:
#   https://www.youtube.com/watch?v=BOLtvLpUmBE BOLtvLpUmBE
#   https://www.youtube.com/watch?v=e_rSD6Y9Qo0 e_rSD6Y9Qo0
from urllib.parse import parse_qs, urlparse

with open('plurls.txt') as f:
    long_urls = f.readlines()
with open("long_short_urls.txt", 'a') as f:
    # Every line of plurls.txt is processed.
    for long_url in long_urls:
        long_url = long_url.strip()
        if not long_url:
            # A blank line has no URL to pair.
            continue
        # Read the ID from the "v" query parameter instead of slicing off a
        # fixed-length prefix: slicing breaks for URLs without the "www."
        # host prefix and keeps trailing parameters such as "&list=...".
        video_ids = parse_qs(urlparse(long_url).query).get("v")
        if not video_ids:
            print("skipping line without a video ID: " + long_url)
            continue
        short_url = video_ids[0]
        f.write(long_url + " " + short_url + '\n')
Method 2:
Get and configure the client_secret.json
- Search for “Google Cloud console” and open it.
- Click “APIs & Services”, and then click the API Library.
- We need the “YouTube Data API v3” to get data such as the URLs of YouTube videos.
- Create a project and an OAuth 2.0 credential for “Desktop Applications”.
- Important: add your Google account (for example, xxxxx@gmail.com) to the “Test users”. Otherwise you cannot get the authentication code.
- After finishing the steps above, download the JSON file using the download icon under “Actions”.
- Put the JSON file in the same directory as the Python script and rename it to “client_secret.json”.
Change the channel_id and run the code.
(Copy and paste the channel URL to look up the channel_id) https://commentpicker.com/youtube-channel-id.php or https://www.youtube.com/account_advanced
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
# Set up the YouTube Data API client (read-only scope).
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
# Lets OAuthlib accept a non-HTTPS redirect; intended for local runs only.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
client_secrets_file = "client_secret.json"
flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)
# run_console() used the copy/paste (out-of-band) flow, which newer
# google-auth-oauthlib releases no longer provide. run_local_server() opens
# the browser and receives the authorization response on a free local port.
credentials = flow.run_local_server(port=0)
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, credentials=credentials)
# Define the channel ID for the channel you want to get videos from
channel_id = "UC2RLsONIDUONQ8R_2xdZlRw"
# Call the API to get the list of videos from the channel, newest first.
# NOTE: a single request is made, so at most `maxResults` (50) videos are
# returned; further pages are not fetched.
request = youtube.search().list(
    part="id",
    channelId=channel_id,
    maxResults=50,
    order="date",
    type="video"
)
response = request.execute()
# Extract the video IDs from the response
video_ids = [item["id"]["videoId"] for item in response["items"]]
# Construct the URLs for the videos
video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]
# Print the URLs
print(video_urls)
Long_Short URL
Format Example:
https://www.youtube.com/watch?v={xxxxxx} {xxxxxx}
# Create long_short_urls.txt, or append to it, with one
# "<full URL> <video ID>" pair per line, for example:
#   https://www.youtube.com/watch?v=BOLtvLpUmBE BOLtvLpUmBE
#   https://www.youtube.com/watch?v=e_rSD6Y9Qo0 e_rSD6Y9Qo0
from urllib.parse import parse_qs, urlparse

with open('plurls.txt') as f:
    long_urls = f.readlines()
with open("long_short_urls.txt", 'a') as f:
    # Every line of plurls.txt is processed.
    for long_url in long_urls:
        long_url = long_url.strip()
        if not long_url:
            # A blank line has no URL to pair.
            continue
        # Read the ID from the "v" query parameter instead of slicing off a
        # fixed-length prefix: slicing breaks for URLs without the "www."
        # host prefix and keeps trailing parameters such as "&list=...".
        video_ids = parse_qs(urlparse(long_url).query).get("v")
        if not video_ids:
            print("skipping line without a video ID: " + long_url)
            continue
        short_url = video_ids[0]
        f.write(long_url + " " + short_url + '\n')
Merge URLs and text → JSONL
# Merge the prompt/completion templates with the long/short URL pairs and
# append one JSON object per line to my_file.txt, in the form
# {"prompt": "x", "completion": "y"}.
# NOTE(review): the output file is my_file.txt, while the prepare_data
# command is given a .jsonl path -- confirm the file is renamed in between.

# xxxxx is prompt: one template per line, repeated 6 times.
# rstrip("\n") is used instead of line[:-1] so that a final line without a
# trailing newline does not lose its last real character.
with open("xxxxx.txt", "r") as f:
    xxxxx = [line.rstrip("\n") for line in f] * 6

# yyyyy is completion: one template per line, repeated 6 times.
with open("yyyyy.txt", "r") as f:
    yyyyy = [line.rstrip("\n") for line in f] * 6

# Load the "<full URL> <short URL>" pairs from long_short_urls.txt.
long_urls = []
short_urls = []
with open("long_short_urls.txt", 'r') as f:
    for line in f:
        fields = line.split()
        if len(fields) != 2:
            # Blank or malformed line: there is no pair to unpack.
            continue
        long_url, short_url = fields
        long_urls.append(long_url)
        short_urls.append(short_url)

# At most 300 samples are written; fewer when the templates or the URL
# pairs run out, instead of failing with an IndexError.
sample_count = min(300, len(xxxxx), len(yyyyy), len(long_urls))

# Fill the placeholders: "xxxxx" in a prompt becomes the short URL and
# "yyyyy" in a completion becomes the full URL.
for i in range(sample_count):
    xxxxx[i] = xxxxx[i].replace("xxxxx", short_urls[i])
    yyyyy[i] = yyyyy[i].replace("yyyyy", long_urls[i])

# format
# {"prompt": "x", "completion": "y"}
# The prompt drops its last character and the completion drops its first
# and last characters -- presumably each template line is wrapped in double
# quotes; confirm against xxxxx.txt / yyyyy.txt.
with open('my_file.txt', 'a') as f:
    for i in range(sample_count):
        record = '{"prompt":' + xxxxx[i][:-1] + ' ->","completion":" ' + yyyyy[i][1:-1] + '"}'
        f.write(record)
        f.write('\n')
        print(record)
!openai tools fine_tunes.prepare_data -f /Users/zhuborui/Desktop/finetuning_url/{local_file}.jsonl
NOTE: if the command fails, add a leading “!” (as shown above). → The data is reformatted.
Example:
{"prompt":"Can you provide me with the full YouTube URL? I have the partial URL as BOLtvLpUmBE. ->","completion":" Absolutely, feel free to access the link at https:\/\/www.youtube.com\/watch?v=BOLtvLpUmBE.\n"}
Either “END” or “\n” can be used as the suffix.
upload training data and get file_id
import openai
import os
# set the OpenAI API key
openai.api_key = '<your API Key>'
# Upload the prepared training data. The file is opened in a `with` block
# so the handle is closed once the upload call returns, even if it raises.
with open('/Users/zhuborui/Desktop/finetuning_url/my_file_prepared.jsonl', "rb") as training_file:
    upload_response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
# The ID of the uploaded file is what the fine-tune request refers to.
file_id = upload_response.id
print(file_id)
start fine-tuning
import requests
import json
# POST a fine-tune request for the uploaded training file.
url = "https://api.openai.com/v1/fine-tunes"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer <your api key>",
}
# `file_id` is the ID obtained when the training data was uploaded.
payload = {"training_file": file_id, "model": "ada"}
r = requests.post(url, headers=headers, json=payload)
# Decoded JSON body of the response (shown as the cell result in a notebook).
r.json()
check the status
import requests
import json
# GET the fine-tunes endpoint to check the status.
url = "https://api.openai.com/v1/fine-tunes"
headers = {
    "Authorization": "Bearer <your api key>",
}
r = requests.get(url=url, headers=headers)
# Decoded JSON body of the response (shown as the cell result in a notebook).
r.json()
model:“