需求
现在有一个包含 121117 条 id::::url 记录的 final_hashs.txt 文件
需要读取里面的url并下载对应的视频,有如下要求:
- 随时可手动停止下载,再次开始下载可继续上次任务
- 保存下载成功的视频id,保存下载失败(链接失效或下载超时)的视频id::::url
- 自定义下载路径
download_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#可间断性地下载final_urls.txt的链接获得视频集
import ast
import urllib.error
import urllib.request
def getLines(file_name):
    """Return the number of newline characters in the text file ``file_name``.

    The file is decoded as UTF-8 with undecodable bytes ignored and is read
    in large chunks so that it never has to fit in memory at once. Because
    only ``'\\n'`` characters are counted, a final line that lacks a trailing
    newline is not included in the result.

    Args:
        file_name: Path of the file to inspect.

    Returns:
        The number of ``'\\n'`` characters found in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 8192
    count = 0
    # The context manager closes the handle even when a read fails part-way.
    with open(file_name, encoding='utf8', errors='ignore') as thefile:
        # iter() with a sentinel stops at the empty string read() returns at EOF.
        for buffer in iter(lambda: thefile.read(chunk_size), ''):
            count += buffer.count('\n')
    return count
# Load the mapping of not-yet-downloaded videos (id -> url) that a previous
# run saved as the repr() of a dict.
#
# NOTE(review): this used to call eval() on the file contents, which executes
# any Python expression found in the file. ast.literal_eval() accepts only
# literals, which is all that str(dict) of a str -> str mapping produces.
# If dict_id_url.txt is ever generated by something other than this script,
# confirm it still contains a plain dict literal.
with open("dict_id_url.txt", encoding='utf8', errors='ignore') as dict_file:
    dict_id_url = ast.literal_eval(dict_file.read())
print("dict_id_url读取成功")  # remaining urls loaded
# Fold the ids downloaded during the previous run (down_ok_this_time.txt) into
# the cumulative success list (down_ok.txt) and drop them from the pending
# mapping, then persist the mapping and clear the per-run log.
with open("down_ok.txt", 'a', encoding='utf8', errors='ignore') as f_down_ok, \
        open("down_ok_this_time.txt", encoding='utf8', errors='ignore') as f_down_ok_this_time:
    for line_down_ok_this_time in f_down_ok_this_time:
        video_id = line_down_ok_this_time.replace('\n', '')
        # A blank line, or an id that is no longer pending, used to raise
        # KeyError here and prevent the script from starting. An id missing
        # from the mapping is presumably left over from a run that was stopped
        # after the mapping was saved but before the per-run log was cleared,
        # in which case it has already been appended to down_ok.txt.
        if video_id not in dict_id_url:
            continue
        del dict_id_url[video_id]  # no longer pending
        f_down_ok.write(video_id + '\n')  # now recorded as downloaded

# Save the urls that are still pending.
with open("dict_id_url.txt", 'w', encoding='utf8', errors='ignore') as f_dict_id_url:
    f_dict_id_url.write(str(dict_id_url))

print("dict_id_url清理已下载urls成功")
print("down_ok添加已下载urls成功")
print("共剩余"+str(len(dict_id_url))+'条未下载')
print("总共已下载"+str(getLines('down_ok.txt'))+'条视频')

# Opening in 'w' mode truncates the per-run log so it starts empty.
with open("down_ok_this_time.txt", 'w', encoding='utf8', errors='ignore'):
    pass
# Append every failure logged by the previous run (url_error_this_time.txt)
# to the cumulative error list (url_error.txt), report the total, and then
# empty the per-run log.
with open("url_error.txt", 'a', encoding='utf8', errors='ignore') as all_errors:
    with open("url_error_this_time.txt", encoding='utf8', errors='ignore') as recent_errors:
        for failed_record in recent_errors:
            # Lines are copied verbatim, including their trailing newline.
            all_errors.write(str(failed_record))

print("url_error增加错误urls成功")
error_total = getLines('url_error.txt')
print("共有" + str(error_total) + '条视频链接错误')

# Opening in 'w' mode truncates the per-run log.
with open("url_error_this_time.txt", 'w', encoding='utf8', errors='ignore'):
    pass
def _append_line(path, text):
    """Append ``text`` followed by a newline to the UTF-8 text file ``path``."""
    with open(path, 'a', encoding='utf8', errors='ignore') as handle:
        handle.write(text + '\n')


# Download every pending video. Each outcome is appended to a per-run log
# as soon as it is known, so stopping the script loses at most the download
# that was in progress.
print('---开始下载---')
downloaded_count = 0  # successful downloads in this run
for key in dict_id_url:
    url = str(dict_id_url[key])
    failed_record = str(key) + '::::' + url

    # Urls that do not contain '.mp4?' are recorded as errors without
    # attempting a download.
    if '.mp4?' not in url:
        _append_line("url_error_this_time.txt", failed_record)
        continue

    video_name = str(key) + '.mp4'  # where the video is saved
    try:
        urllib.request.urlretrieve(url, video_name)
    except (TimeoutError, urllib.error.URLError):
        print('----url失效,下载超时----')
        _append_line("url_error_this_time.txt", failed_record)
        print('本次已下载'+str(getLines('down_ok_this_time.txt'))+'条,本次下载url错误'+str(getLines('url_error_this_time.txt'))+'条')
        print('已保存并跳过此条url,下载继续-->')
        continue

    _append_line("down_ok_this_time.txt", str(key))
    print('。。。。。')  # one marker per successful download
    downloaded_count += 1
    # The counter used to start at 1 and was concatenated to a str without
    # conversion, which raised TypeError at the first progress report. It now
    # starts at 0 and is reported once per 1000 completed downloads.
    if downloaded_count % 1000 == 0:
        print('本次已下载' + str(downloaded_count) + '条视频')
print("****下载全部完成****")
小结
数据集与代码已发布在 GitHub。