vine dataset processing (3)

Requirements

We now have a final_hashs.txt file with 121,117 id::::url lines.
We need to read the urls in it and download the corresponding videos, subject to the following requirements (a sketch of the preparation step that download_2.py relies on follows the list):

  1. Downloading can be stopped manually at any time, and restarting the script resumes where the previous run left off
  2. Save the ids of videos that downloaded successfully, and save the id::::url of videos that failed (broken link or download timeout)
  3. The download path can be customized
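download_2.py below keeps its work list in dict_id_url.txt, a Python dict literal mapping each id to its url, and it also expects its four bookkeeping files to exist. Neither is produced by the script itself, so here is a minimal one-off preparation sketch; the file name prepare_dict.py is hypothetical, and it only assumes that final_hashs.txt holds one id::::url pair per line as described above.

prepare_dict.py

# prepare_dict.py -- hypothetical one-off helper, run once before download_2.py:
# turn final_hashs.txt (one "id::::url" per line) into the dict literal
# that download_2.py reads back, and create the empty bookkeeping files.
dict_id_url = {}
with open("final_hashs.txt", encoding='utf8', errors='ignore') as f:
    for line in f:
        line = line.strip()
        vid, sep, url = line.partition('::::')      # id and url are separated by '::::'
        if sep:
            dict_id_url[vid] = url

with open("dict_id_url.txt", 'w', encoding='utf8', errors='ignore') as f:
    f.write(str(dict_id_url))                       # repr format that download_2.py eval()s

# empty logs download_2.py opens for reading/appending on its first run
for name in ("down_ok.txt", "down_ok_this_time.txt",
             "url_error.txt", "url_error_this_time.txt"):
    open(name, 'a', encoding='utf8').close()

print(str(len(dict_id_url)) + " urls written to dict_id_url.txt")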
download_2.py
# Download the video set behind the urls from final_urls.txt; the run can be
# interrupted at any time and resumed later.
import socket
import urllib.error
import urllib.request


def getLines(file_name):  # count the lines in a file
    count = 0
    thefile = open(file_name, encoding='utf8', errors='ignore')
    while True:
        buffer = thefile.read(1024 * 8192)
        if not buffer:
            break
        count += buffer.count('\n')
    thefile.close()
    return count


# read the remaining urls: dict_id_url.txt holds a dict literal {id: url}
dict_id_url = {}
f = open("dict_id_url.txt", encoding='utf8', errors='ignore')
a = f.read()
dict_id_url = eval(a)
f.close()
print("dict_id_url loaded")

# move the ids downloaded during the previous run from the per-run log
# into down_ok.txt and drop them from the pending dictionary
f_down_ok = open("down_ok.txt", 'a', encoding='utf8', errors='ignore')
f_down_ok_this_time = open("down_ok_this_time.txt", encoding='utf8', errors='ignore')
while 1:
    line_down_ok_this_time = f_down_ok_this_time.readline()
    if not line_down_ok_this_time:
        break
    else:
        line_down_ok_this_time = line_down_ok_this_time.replace('\n', '')
        del dict_id_url[str(line_down_ok_this_time)]          # update the pending urls
        f_down_ok.write(str(line_down_ok_this_time) + '\n')   # update the downloaded ids
f_down_ok_this_time.close()
f_down_ok.close()

f_dict_id_url = open("dict_id_url.txt", 'w', encoding='utf8', errors='ignore')  # save the pending urls
f_dict_id_url.write(str(dict_id_url))
f_dict_id_url.close()
print("dict_id_url: already-downloaded urls removed")
print("down_ok: downloaded ids appended")

print(str(len(dict_id_url)) + " urls left to download")
print(str(getLines('down_ok.txt')) + " videos downloaded in total")

f = open("down_ok_this_time.txt", 'w', encoding='utf8', errors='ignore')  # truncate the per-run log
f.close()

# merge the per-run error log into url_error.txt
f_url_error = open("url_error.txt", 'a', encoding='utf8', errors='ignore')
f_url_error_this_time = open("url_error_this_time.txt", encoding='utf8', errors='ignore')
while 1:
    line_url_error_this_time = f_url_error_this_time.readline()
    if not line_url_error_this_time:
        break
    else:
        f_url_error.write(str(line_url_error_this_time))      # update the broken urls
f_url_error_this_time.close()
f_url_error.close()

print("url_error: broken urls appended")
print(str(getLines('url_error.txt')) + " video urls are broken")

f = open("url_error_this_time.txt", 'w', encoding='utf8', errors='ignore')  # truncate the per-run log
f.close()

print('--- download started ---')
socket.setdefaulttimeout(30)  # arbitrary 30 s timeout so a stalled download raises TimeoutError instead of hanging
flag = 1
for key in dict_id_url:
    url = str(dict_id_url[key])
    try:
        if '.mp4?' in url:
            video_name = str(key) + '.mp4'               # path the video is saved to
            urllib.request.urlretrieve(url, video_name)  # download
            f_down_ok_this_time = open("down_ok_this_time.txt", 'a', encoding='utf8', errors='ignore')
            f_down_ok_this_time.write(str(key) + '\n')
            f_down_ok_this_time.close()
            print('。。。。。')                            # one marker per successful download
            flag += 1
        else:
            # not a direct .mp4 link, record it as a broken url
            f_url_error_this_time = open("url_error_this_time.txt", 'a', encoding='utf8', errors='ignore')
            f_url_error_this_time.write(str(key) + '::::' + dict_id_url[key] + '\n')
            f_url_error_this_time.close()
    except (TimeoutError, urllib.error.URLError):
        print('---- url broken or download timed out ----')
        f_url_error_this_time = open("url_error_this_time.txt", 'a', encoding='utf8', errors='ignore')
        f_url_error_this_time.write(str(key) + '::::' + dict_id_url[key] + '\n')
        f_url_error_this_time.close()
        print(str(getLines('down_ok_this_time.txt')) + ' downloaded this run, '
              + str(getLines('url_error_this_time.txt')) + ' url errors this run')
        print('url saved and skipped, downloading continues -->')
    if flag % 1000 == 0:
        print(str(flag) + ' videos downloaded this run')
print("**** all downloads finished ****")
Summary

The dataset and the code are published on github.