I'm having trouble retrieving the YouTube video automatically. Here's the code; the problem is the last part: download = urllib.request.urlopen(download_url).read()
# YouTube video download script
# 10n1z3d[at]w[dot]cn
import urllib.request
import sys
print("\n--------------------------")
print (" YouTube Video Downloader")
print ("--------------------------\n")
try:
    video_url = sys.argv[1]
except:
    video_url = input('[+] Enter video URL: ')
print("[+] Connecting...")
try:
    if(video_url.endswith('&feature=related')):
        video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=related')[0]
    elif(video_url.endswith('&feature=dir')):
        video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=dir')[0]
    elif(video_url.endswith('&feature=fvst')):
        video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=fvst')[0]
    elif(video_url.endswith('&feature=channel_page')):
        video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=channel_page')[0]
    else:
        video_id = video_url.split('www.youtube.com/watch?v=')[1]
except:
    print("[-] Invalid URL.")
    exit(1)
print("[+] Parsing token...")
try:
    url = str(urllib.request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read())
    token_value = url.split('video_id=' + video_id + '&token=')[1].split('&thumbnail_url')[0]
    download_url = "http://www.youtube.com/get_video?video_id=" + video_id + "&t=" + token_value + "&fmt=18"
except:
    url = str(urllib.request.urlopen('www.youtube.com/watch?v=' + video_id))
    exit(1)
v_url = str(urllib.request.urlopen('http://' + video_url).read())
video_title = v_url.split('"rv.2.title": "')[1].split('", "rv.4.rating"')[0]
if '&quot;' in video_title:
    video_title = video_title.replace('&quot;', '"')
elif '&amp;' in video_title:
    video_title = video_title.replace('&amp;', '&')
print("[+] Downloading " + '"' + video_title + '"...')
try:
    print(download_url)
    file = open(video_title + '.mp4', 'wb')
    download = urllib.request.urlopen(download_url).read()
    print(download)
    for line in download:
        file.write(line)
    file.close()
except:
    print("[-] Error downloading. Quitting.")
    exit(1)
print("\n[+] Done. The video is saved to the current working directory(cwd).\n")
There’s an error message (thanks Wooble):
Traceback (most recent call last):
  File "C:/Python31/MyLib/DrawingBoard/youtube_download-.py", line 52, in <module>
    download = urllib.request.urlopen(download_url).read()
  File "C:\Python31\lib\urllib\request.py", line 119, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python31\lib\urllib\request.py", line 353, in open
    response = meth(req, response)
  File "C:\Python31\lib\urllib\request.py", line 465, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python31\lib\urllib\request.py", line 385, in error
    result = self._call_chain(*args)
  File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
    result = func(*args)
  File "C:\Python31\lib\urllib\request.py", line 560, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "C:\Python31\lib\urllib\request.py", line 353, in open
    response = meth(req, response)
  File "C:\Python31\lib\urllib\request.py", line 465, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python31\lib\urllib\request.py", line 391, in error
    return self._call_chain(*args)
  File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
    result = func(*args)
  File "C:\Python31\lib\urllib\request.py", line 473, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
The code in the original question relies on several assumptions about the content of YouTube pages and URLs (expressed in constructs such as "url.split('something=')[1]") which may not always hold. I tested it, and it seems to depend even on which related videos appear on the page. You might have tripped over any of those specifics.
Here's a cleaner version, which uses urllib to parse URLs and query strings, and which successfully downloads a video. For clarity, I've removed some of the try/except blocks that did little beyond exiting. Incidentally, it deals with Unicode video titles by removing non-ASCII characters from the filename to which the video is saved. It also takes any number of YouTube URLs and downloads them all. Finally, it masks its user-agent as Chrome for Mac (which is what I currently use).
#!/usr/bin/env python3
import sys
import urllib.request
from urllib.request import urlopen, FancyURLopener
from urllib.parse import urlparse, parse_qs, unquote
class UndercoverURLopener(FancyURLopener):
    version = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2"

urllib.request._urlopener = UndercoverURLopener()

def youtube_download(video_url):
    video_id = parse_qs(urlparse(video_url).query)['v'][0]

    url_data = urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read()
    url_info = parse_qs(unquote(url_data.decode('utf-8')))
    token_value = url_info['token'][0]

    download_url = "http://www.youtube.com/get_video?video_id={0}&t={1}&fmt=18".format(
        video_id, token_value)

    video_title = url_info['title'][0] if 'title' in url_info else ''
    # Unicode filenames are more trouble than they're worth
    filename = video_title.encode('ascii', 'ignore').decode('ascii').replace("/", "-") + '.mp4'

    print("\t Downloading '{}' to '{}'...".format(video_title, filename))

    try:
        download = urlopen(download_url).read()
        f = open(filename, 'wb')
        f.write(download)
        f.close()
    except Exception as e:
        print("\t Download failed! {}".format(str(e)))
        print("\t Skipping...")
    else:
        print("\t Done.")

def main():
    print("\n--------------------------")
    print(" YouTube Video Downloader")
    print("--------------------------\n")

    # sys.argv[1:] never raises, so test for an empty list instead of try/except,
    # and split the prompt input so each URL is handled separately
    video_urls = sys.argv[1:]
    if not video_urls:
        video_urls = input('Enter (space-separated) video URLs: ').split()

    for u in video_urls:
        youtube_download(u)

    print("\n Done.")

if __name__ == '__main__':
    main()
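One caveat about the user-agent masking: in Python 3, urlopen() builds its own opener and does not consult urllib.request._urlopener (that global is only used by the legacy urlretrieve()/FancyURLopener path), so the Chrome string above may never actually be sent on the urlopen() calls. A minimal sketch of an alternative, reusing the UndercoverURLopener.version string defined above and attaching it explicitly to each request:

from urllib.request import Request, urlopen

def fetch(url):
    # Attach the spoofed User-Agent to this specific request;
    # urlopen() will then send it instead of the default "Python-urllib/3.x".
    req = Request(url, headers={'User-Agent': UndercoverURLopener.version})
    return urlopen(req).read()

youtube_download() could then call fetch() wherever it currently calls urlopen(...).read().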
I'm going to shamelessly plug my own script, which automates checking for valid formats, automatically chooses the best-quality format for a video, and works on both the Flash and HTML5 variants of YouTube pages (as well as on Vimeo).
If you wrote that script, please look at my source code for inspiration and feel free to steal some code. I challenge you to write something better. Open source thrives on competition!
However, if you copied that script and are just trying to get it working, may I suggest you give my script a try and see if it fares better for you? You can use it from the command line as a script, or as a module in another Python file.
You may also want to check out youtube-dl, which is written in Python, and see how it's done there.
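If you go the youtube-dl route, it can also be driven from another Python program instead of the command line. A minimal sketch, assuming the package is installed and using illustrative options (the 'format': '18' choice mirrors the fmt=18 requests in the scripts above):

import youtube_dl  # pip install youtube_dl

options = {
    'format': '18',                  # 360p MP4, the same format the scripts above ask for
    'outtmpl': '%(title)s.%(ext)s',  # save the file under the video title
}
ydl = youtube_dl.YoutubeDL(options)
ydl.download(['http://www.youtube.com/watch?v=4tAr7tuakt0'])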
It looks like the YouTube folks have changed the algorithm for accessing video files. Instead of "token", they now use a "signature" variable, and "signature" seems to depend either on cookie-stored data or on the IP address of the client (in the case of a cookie-less client such as urllib in Python 2). Here's a hack I've come up with (note that the URLs are locked to the client's IP address):
#!/usr/bin/python
import re
from urlparse import *
from urllib import *
def yt_url(video_url):
    video_id = parse_qs(urlparse(video_url).query)['v'][0]

    get_vars = parse_qs(unquote(urlopen("http://www.youtube.com/get_video_info?video_id=" + video_id).read()))

    url = get_vars["id"][0].split(",")[1].split("|")[1]
    elements = dict()
    elements["itag"] = get_vars["itag"][0]
    elements["sver"] = get_vars["sver"][0]
    elements["expire"] = get_vars["expire"][0]
    elements["signature"] = get_vars["signature"][0]
    elements["factor"] = get_vars["factor"][0]
    elements["id"] = get_vars["id"][0].split(",")[0]
    elements["key"] = get_vars["key"][0]
    elements["burst"] = get_vars["burst"][0]
    elements["sparams"] = get_vars["sparams"][0]
    elements["algorithm"] = get_vars["algorithm"][0]
    elements["ipbits"] = "8"

    for get_var in elements:
        url += "&" + get_var + "=" + elements[get_var]

    return (get_vars["title"][0], url)

if __name__ == '__main__':
    (title, url) = yt_url("http://www.youtube.com/watch?v=4tAr7tuakt0")
    print "Title: %s" % (title,)
    print "Video: %s" % (url,)
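For completeness, the URL returned by yt_url() can be streamed straight to disk; a minimal sketch, assuming the parameters above resolve to an MP4 stream and ignoring filename sanitization:

title, url = yt_url("http://www.youtube.com/watch?v=4tAr7tuakt0")
# urlretrieve (available via "from urllib import *" above) writes the response body to the given file
urlretrieve(url, title + ".mp4")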
#!/usr/bin/env python
import urllib2, urllib
import re
import os
import sys
import time
linkurl = raw_input('Enter URL:')
linkurl1 = urllib.urlopen(linkurl).read()
file1 = open("index.html", "w")
file1.write(linkurl1)
file1.close()
fname = 'index.html'
## Giving new matrix value to find
find = ("yt.playerConfig =", '"title":')
## File reading programme
with open(fname) as infile:
    for line_no, line in enumerate(infile, 1):
        lline = line.lower()
        if any(word.lower() in lline for word in find):
            y = line.rstrip()

fileurl = y
y1 = y.replace("%3A%2F%2F", "://")
y2 = y1.replace("%2F", "/")
y3 = y2.replace("%3F", "?")
y4 = y3.replace("%3D", "=")
y5 = y4.replace("%26", "&")
y6 = y5.replace("%252", "%2")
y7 = y6.replace("sig", "&signature")
# Display video resolution information
print ""
print "Video resolution: "
print "[46=1080(.webm)]--[37=1080(.mp4)]--[35=480(.flv)]--[36=180(.3gpp)]"
print "[45=720(.webm) ]--[22=720(.mp4) ]--[34=360(.flv)]--[17=144(.3gpp)]"
print "[44=480(.webm) ]--[18=360(.mp4) ]--[5=240(.flv) ]"
print "[43=360(.webm) ]"
print ""
# Programme to get all itag list file
itag = re.findall('itag=(\d+)', y)
print "itag list= " + `itag`
resol = raw_input("Type itag number: ")
# Programme to get filename file
fname = 'index.html'
find = (' <title>', '</title>')
with open(fname) as infile:
    for line_no, line in enumerate(infile, 1):
        lline = line.lower()
        if any(word.lower() in lline for word in find):
            y = line.rstrip()

fileurl1 = y.split(">")[-2]
filename2 = fileurl1.split('"')[-2]
if resol == '46':
    # Programme to get WebM file in 1080 HD
    y1080_webm = re.findall(r'itag=46(.*?)\u0026quality=hd1080', y7)
    url_1080_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y1080_webm`)
    signature = re.findall(r'signature=(.*?)\\', `y1080_webm`)
    url_1080_webm2 = `url_1080_webm1`.split("\\")[0]
    url_1080_webm = url_1080_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_1080_webm
    #print url_1080_webm
    ext = ".webm"
elif resol == '37':
    # Programme to get MP4 file in 1080 HD
    y1080_mp4 = re.findall(r'itag=37(.*?)\u0026quality=hd1080', y7)
    url_1080_mp41 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y1080_mp4`)
    signature = re.findall(r'signature=(.*?)\\', `y1080_mp4`)
    url_1080_mp42 = `url_1080_mp41`.split("\\")[0]
    url_1080_mp4 = url_1080_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_1080_mp4
    #print url_1080_mp4
    ext = ".mp4"
elif resol == '45':
    # Programme to get WebM file in 720 HD
    y720_webm = re.findall(r'itag=45(.*?)\u0026quality=hd720', y7)
    url_720_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y720_webm`)
    signature = re.findall(r'signature=(.*?)\\', `y720_webm`)
    url_720_webm2 = `url_720_webm1`.split("\\")[0]
    url_720_webm = url_720_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_720_webm
    #print url_720_webm
    ext = ".webm"
elif resol == '22':
    # Programme to get MP4 file in 720 HD
    y720_mp4 = re.findall(r'itag=22(.*?)\u0026quality=hd720', y7)
    url_720_mp41 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y720_mp4`)
    signature = re.findall(r'signature=(.*?)\\', `y720_mp4`)
    url_720_mp42 = `url_720_mp41`.split("\\")[0]
    url_720_mp4 = url_720_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_720_mp4
    #print url_720_mp4
    ext = ".mp4"
elif resol == '44':
    # Programme to get WebM file in 480 large
    y480_webm = re.findall(r'itag=44(.*?)\u0026quality=large', y7)
    url_480_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y480_webm`)
    signature = re.findall(r'signature=(.*?)\\', `y480_webm`)
    url_480_webm2 = `url_480_webm1`.split("\\")[0]
    url_480_webm = url_480_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_480_webm
    #print url_480_webm
    ext = ".webm"
elif resol == '35':
    # Programme to get a FLV file in 480 large
    y480_flv = re.findall(r'itag=35(.*?)\u0026quality=large', y7)
    url_480_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y480_flv`)
    signature = re.findall(r'signature=(.*?)\\', `y480_flv`)
    url_480_flv2 = `url_480_flv1`.split("\\")[0]
    url_480_flv = url_480_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_480_flv
    #print url_480_flv
    ext = ".flv"
elif resol == '43':
    # Programme to get WebM file in 360 medium
    y360_webm = re.findall(r'itag=43(.*?)\u0026quality=medium', y7)
    url_360_webm1 = re.findall(r'\\u0026url=(.*?)\\', `y360_webm`)
    signature = re.findall(r'signature=(.*?)\\', `y360_webm`)
    url_360_webm2 = `url_360_webm1`.split("\\")[0]
    url_360_webm = url_360_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_360_webm
    #print url_360_webm
    ext = ".webm"
elif resol == '34':
    # Programme to get FLV file in 360 medium
    y360_flv = re.findall(r'itag=34(.*?)\u0026quality=medium', y7)
    url_360_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y360_flv`)
    signature = re.findall(r'signature=(.*?)\\', `y360_flv`)
    url_360_flv2 = `url_360_flv1`.split("\\")[0]
    url_360_flv = url_360_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_360_flv
    #print url_360_flv
    ext = ".flv"
elif resol == '18':
    # Programme to get MP4 file in 360 medium
    y360_mp4 = re.findall(r'itag=18(.*?)\u0026quality=medium', y7)
    url_360_mp41 = re.findall(r'\\u0026url=(.*?)\\', `y360_mp4`)
    signature = re.findall(r'signature=(.*?)\\', `y360_mp4`)
    url_360_mp42 = `url_360_mp41`.split("\\")[0]
    url_360_mp4 = url_360_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_360_mp4
    #print url_360_mp4
    ext = ".mp4"
elif resol == '5':
    # Programme to get FLV file in 240 small
    y240_flv = re.findall(r'itag=5(.*?)\u0026quality=small', y7)
    url_240_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y240_flv`)
    signature = re.findall(r'signature=(.*?)\\', `y240_flv`)
    url_240_flv2 = `url_240_flv1`.split("\\")[0]
    url_240_flv = url_240_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_240_flv
    #print url_240_flv
    ext = ".flv"
elif resol == '36':
    # Programme to get 3gpp file in 180 small
    y180_3gpp = re.findall(r'itag=36(.*?)\u0026quality=small', y7)
    url_180_3gpp1 = re.findall(r'\\u0026url=(.*?)\\', `y180_3gpp`)
    signature = re.findall(r'signature=(.*?)\\', `y180_3gpp`)
    url_180_3gpp2 = `url_180_3gpp1`.split("\\")[0]
    url_180_3gpp = url_180_3gpp2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_180_3gpp
    #print url_180_3gpp
    ext = ".3gpp"
elif resol == '17':
    # Programme to get 3gpp file in 144 small
    y144_3gpp = re.findall(r'itag=17(.*?)\u0026quality=small', y7)
    url_144_3gpp1 = re.findall(r'\\u0026url=(.*?)\\', `y144_3gpp`)
    signature = re.findall(r'signature=(.*?)\\', `y144_3gpp`)
    url_144_3gpp2 = `url_144_3gpp1`.split("\\")[0]
    url_144_3gpp = url_144_3gpp2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
    url = url_144_3gpp
    #print url_144_3gpp
    ext = ".3gpp"
#newindex = open("index1.txt", 'w')
#newindex.write(y7)
print url
filename = filename2 + ext
print filename
req = urllib2.Request(url, headers={'Range': "bytes=0-838860800"})
data = urllib2.urlopen(req)
print "connected to ""http://"+url.split("/")[2] + "/"
f = open(filename,'wb')
meta_data = data.info()
file_size = int(meta_data.getheaders("Content-Length")[0])
print "filesize= " + `file_size/1048576` + " MB"
bytes_received = 0
chunk_size = 10240
while True:
    start_time = time.time()
    buffer = data.read(chunk_size)
    if not buffer:
        break
    bytes_received += len(buffer)
    f.write(buffer)
    Td = time.time() - start_time
    speed1 = round(len(buffer)/1024.0, 1)
    speed = round(speed1/Td, 1)
    speed_MB = round(speed/1024.0, 1)
    speed_GB = round(speed_MB/1024.0, 1)
    bytes_received_MB = round(bytes_received/1048576.0, 3)
    percent = bytes_received * 100. / file_size
    if speed < 1:
        speed_byte = round(len(buffer)/Td, 1)
        Tr = (file_size-bytes_received)/(60*speed_byte)
        status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f B/s] [eta %1d min] " % (bytes_received_MB, percent, speed_byte, Tr)
    elif speed < 1024:
        Tr = (file_size-bytes_received)/(60*1024*speed)
        status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f KB/s] [eta %1d min] " % (bytes_received_MB, percent, speed, Tr)
    elif speed < 1048576:
        Tr = (file_size-bytes_received)/(60*1024*1024*speed_MB)
        status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f MB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_MB, Tr)
    else:
        Tr = (file_size-bytes_received)/(60*1024*1024*1024*speed_GB)
        status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f GB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_GB, Tr)
    status = status + chr(8) * (len(status) + 1)
    print status,