Vine Dataset Processing 4

Requirements & Prerequisites

Video files across several folders: 121117 - 1862 (download failures) = 119255 items
Segmented files across several folders: 119255 - 135 (segmentation failures) = 119120 items
The task now is to run a hashtag frequency count over these 119120 items, keep only the data whose hashtag frequency is at least 5, and remove the invalid entries.
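
Before the full scripts, here is a minimal sketch of the frequency-cutoff idea they implement, using made-up toy tags rather than the real dataset:

# Minimal sketch of the frequency cutoff (toy data, not from the dataset)
import collections

tags = ['cat'] * 5 + ['dog'] * 2 + ['bird']
counts = collections.Counter(tags)
kept = {tag for tag, n in counts.items() if n >= 5}  # keep tags seen at least 5 times
print(kept)  # prints {'cat'}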

Counting the segmented data:
Walk the video folders to build list_id, then remove the entries recorded in extract_fail.

get_all_videos.py
# Collect the ids of all successfully segmented short videos into list_id.txt
import os


def file_name(file_dir):  # list the file names directly under a directory
    for root, dirs, files in os.walk(file_dir):
        return files  # only the top level is needed
    return []


path = '/home/caoda/Hodge_work_space/videos_data/videos_201712'
f_list_id = open("list_id.txt", encoding='utf8', errors='ignore')
a = f_list_id.read()
list_id = eval(a)  # list_id.txt stores the str() of a Python list
f_list_id.close()

print('Loaded list_id; currently ' + str(len(list_id)) + ' entries')
list_path = ['07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '18', '19', '20', '21', '24']
for i in list_path:
    list_id += file_name(path + i)
    # print(path + str(i))

print('Total videos: ' + str(len(list_id)))  # 121117 minus the download failures
# Remove the extraction failures and report what remains

f_extract_fail = open("extract_fail", encoding='utf8', errors='ignore')
i = 0
while 1:
    line_extract_fail = f_extract_fail.readline()
    if not line_extract_fail:
        break
    line_extract_fail = line_extract_fail.replace('\n', '')
    if line_extract_fail in list_id:
        list_id.remove(line_extract_fail)
        i += 1

print('Removed ' + str(i) + ' entries\nFinal total: ' + str(len(list_id)))

f_list_id = open("list_id.txt", 'w', encoding='utf8', errors='ignore')
f_list_id.write(str(list_id))
f_list_id.close()

list_id.clear()
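
A side note, not part of the original pipeline: these scripts persist lists and dicts by writing str(obj) and reading it back with eval(), which executes whatever the file contains. A sketch of a safer equivalent using the standard json module (the list_id.json file name is made up for illustration):

# Sketch: json round-trip instead of str()/eval(); list_id.json is a hypothetical name
import json

with open('list_id.json', 'w', encoding='utf8') as f:
    json.dump(list_id, f)

with open('list_id.json', encoding='utf8') as f:
    list_id = json.load(f)

Similarly, loading extract_fail into a set and rebuilding the list avoids the repeated O(n) list.remove() calls:

# Sketch: set-based filtering of the failed ids
with open('extract_fail', encoding='utf8') as f:
    fails = set(line.strip() for line in f)
list_id = [v for v in list_id if v not in fails]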
Frequency counting

Using list_id and the 121117-line final_hashs.txt, compute hashtag frequencies over the final data and write them to count_hash.txt.
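
Each line of final_hashs.txt follows the id::::text layout that the regexes in count.py rely on. A hypothetical example (the id and tags here are made up):

# Hypothetical final_hashs.txt line, matching the format count.py parses
line = '123456789::::#funny #cat \n'
# get_id(line)   -> ['123456789']
# get_text(line) -> ['#funny #cat ']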

count.py
# Count the total number of hashtags and their frequencies
import collections
import re


def get_id(template):
    rule = r'(.*?)::::'
    slot_list = re.findall(rule, template)
    return slot_list


def get_text(template):
    rule = r'::::(.*?)\n'
    slot_list = re.findall(rule, template)
    return slot_list


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python list or dict
    f.close()
    return dict_or_list


list_id = get_list_or_dict('list_id.txt')
print('list_id length: ' + str(len(list_id)))

f_id_hash_final = open("id_hash_final.txt", 'w', encoding='utf8', errors='ignore')

# Build the id -> hashtag-text dict from final_hashs.txt
f = open("final_hashs.txt", encoding='utf8', errors='ignore')
dict_id_hash = {}
while 1:
    line_txt = f.readline()
    if not line_txt:
        break
    dict_id_hash[get_id(line_txt)[0]] = get_text(line_txt)[0]
f.close()
print('dict_id_hash length: ' + str(len(dict_id_hash)))  # 121117
f_dict_id_hash = open("dict_id_hash.txt", 'w', encoding='utf8', errors='ignore')
f_dict_id_hash.write(str(dict_id_hash))
f_dict_id_hash.close()

# Keep only the ids that survived segmentation
for id in list_id:
    id = id.replace('.mp4', '')
    f_id_hash_final.write(id + '::::' + dict_id_hash[id] + '\n')
f_id_hash_final.close()

f_id_hash_final = open("id_hash_final.txt", encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt", 'w', encoding='utf8', errors='ignore')


def subString1(template):  # a hashtag starts with '#' and ends at a space or newline
    copy = False
    finished = False
    slotList = []
    tag = ""
    for s in template:
        if s == '#':
            copy = True
        elif s == ' ' or s == '\n':
            copy = False
            finished = True
        elif copy:
            tag = tag + s
        if finished:
            if tag != "":
                slotList.append(tag)
            tag = ""
            finished = False
    return slotList


slotList = []
while 1:
    line_id_hash = f_id_hash_final.readline()
    if not line_id_hash:
        break
    slotList += subString1(line_id_hash)  # collect every hashtag into one list

result = collections.Counter(slotList)  # frequency count
# print(result.most_common(100))  # inspect the top 100 tags
for each in result.most_common():
    ss = str(each)  # one "(tag, count)" tuple repr per line
    f_count_hash.write(ss + '\n')
f_id_hash_final.close()
f_count_hash.close()
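
As an aside, subString1 can be approximated with a single regex; a sketch (behavior differs on edge cases such as adjacent '#' characters, so this is not a drop-in replacement for the original):

# Sketch: regex-based hashtag extraction, roughly equivalent to subString1
import re

def extract_hashtags(line):
    # a '#' followed by characters up to the next space or newline
    return re.findall(r'#([^ \n#]+)[ \n]', line)

print(extract_hashtags('123::::#funny #cat \n'))  # prints ['funny', 'cat']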
Truncating at the frequency cutoff

Using the frequency counts, keep the data whose hashtags have a frequency of at least 5 and write it to id_hash.
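
Because Counter.most_common() writes count_hash.txt in descending frequency order, the script below can simply stop at the first line whose count is 4: every tag before that line has frequency of at least 5. Each line is the repr of a (tag, count) tuple, so a simpler parse than the regexes below would be ast.literal_eval; a sketch (assuming every line really is a well-formed tuple repr, which str((tag, n)) produces):

# Sketch: parsing a count_hash.txt line such as "('funny', 12)" without regexes
import ast

tag, freq = ast.literal_eval("('funny', 12)")
print(tag, freq >= 5)  # prints: funny True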

count_num_hash.py
import re


def getText(template):
    rule = r'::::(.*?)\n'
    slotList = re.findall(rule, template)
    return slotList


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python list or dict
    f.close()
    return dict_or_list


f_count_hash = open("count_hash.txt", encoding='utf8', errors='ignore')
list_hash = []
flag_hash_line = 0
while 1:
    line_hash = f_count_hash.readline()
    if not line_hash:
        break
    rule1 = r"'(.*?)'"   # tuple repr with single quotes: ('tag', 12)
    rule2 = r'"(.*?)"'   # repr falls back to double quotes when the tag contains a quote
    rule3 = r' (.*?)\)'  # the count at the end of the tuple
    try:
        slotList = re.findall(rule1, line_hash)
        hash_num = re.findall(rule3, line_hash)
        if int(hash_num[0]) == 4:
            # the file is sorted by descending count, so the first count of 4
            # means every remaining tag has frequency below 5
            break
        list_hash.append(str(slotList[0]))
        flag_hash_line += 1
    except IndexError:
        slotList = re.findall(rule2, line_hash)
        list_hash.append(str(slotList[0]))
        flag_hash_line += 1

print(flag_hash_line)

list_id = get_list_or_dict('list_id.txt')
print(len(list_id))
dict_id_hash = get_list_or_dict('dict_id_hash.txt')
print(len(dict_id_hash))

f_id_hash = open("id_hash", 'w', encoding='utf8', errors='ignore')
i = 0
for id in list_id:
    id = str(id).replace('.mp4', '')
    line_list_hash = dict_id_hash[id].split(' ')
    line_hash_new = ''
    for hash in line_list_hash:
        hash = str(hash).replace('#', '')
        if hash in list_hash:
            line_hash_new += '#' + hash + ' '
    if line_hash_new != '':  # skip entries left with no surviving hashtag
        f_id_hash.write(id + '::::' + line_hash_new + '\n')
        i += 1
        if i % 10000 == 0:
            print(i)
f_id_hash.close()
print(i)

The final pass yields an id_hash file with 118684 entries.

Cleaning the data

Using the id_error file, delete the invalid entries from the set of 119120 segmented files.
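
The script below assumes dict_id_num.txt stores the str() of a dict mapping each video id to the name of its numbered folder entry; a hypothetical shape (the values here are made up):

# Hypothetical shape of the dict stored in dict_id_num.txt
dict_id_num = {'123456789': '000001', '987654321': '000002'}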

clean_extract_data.py
import re
import os
import shutil


def get_id(template):
    rule = r'(.*?)::::'
    slot_list = re.findall(rule, template)
    return slot_list


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python dict
    f.close()
    return dict_or_list


dict_id_num = get_list_or_dict('dict_id_num.txt')
print(len(dict_id_num))

# Map each bad id from id_error to its folder entry name
list_id_error = []
f_id_error = open('id_error', encoding='utf8', errors='ignore')
while 1:
    line_id_error = f_id_error.readline()
    if not line_id_error:
        break
    list_id_error.append(dict_id_num[line_id_error.replace('\n', '')])
print(len(list_id_error))

path = '/home/caoda/Hodge_work_space/dataset/dataset'
ii = 0
for i in range(1, 16):
    list_extract_num = os.listdir(path + str(i))
    for num in list_extract_num:
        if num in list_id_error:
            try:
                shutil.rmtree(path + str(i) + '/' + num)
                # print(path + str(i) + '/' + num)
            except OSError:
                print('Deletion failed: ' + path + str(i) + '/' + num)
    ii += len(os.listdir(path + str(i)))  # count what remains in each folder
print(ii)
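
One more hedged aside: "num in list_id_error" scans a Python list on every check, which gets slow across fifteen folders. A set gives constant-time lookups; a sketch of the same deletion loop (reusing list_id_error and path from the script above):

# Sketch: the deletion loop with a set for O(1) membership tests
import os
import shutil

set_id_error = set(list_id_error)  # build once
for i in range(1, 16):
    for num in os.listdir(path + str(i)):
        if num in set_id_error:
            shutil.rmtree(path + str(i) + '/' + num)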

At this point the dataset is fully prepared and segmented.
The dataset and code are published on GitHub.