Vine Dataset Processing 4

Requirements & Prerequisites

Video files across several folders: 121117 - 1862 (download failures) = 119255 items
Segmented files across several folders: 119255 - 135 (segmentation failures) = 119120 items
The task now is to run a hashtag frequency count over these 119120 items, keep only the data whose hashtag frequency is at least 5, and remove the invalid entries.
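
Before the full scripts, here is a minimal sketch of the frequency-cutoff idea they implement, using made-up toy tags rather than the real dataset:

# Minimal sketch of the frequency cutoff (toy data, not from the dataset)
import collections

tags = ['cat'] * 5 + ['dog'] * 2 + ['bird']
counts = collections.Counter(tags)
kept = {tag for tag, n in counts.items() if n >= 5}  # keep tags seen at least 5 times
print(kept)  # prints {'cat'}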

Counting the segmented data:
Walk the video folders to build list_id, then remove the entries recorded in extract_fail.

get_all_videos.py
# Collect the ids of all successfully segmented short videos into list_id.txt
import os


def file_name(file_dir):  # list the file names directly under a directory
    for root, dirs, files in os.walk(file_dir):
        return files  # only the top level is needed
    return []


path = '/home/caoda/Hodge_work_space/videos_data/videos_201712'
f_list_id = open("list_id.txt", encoding='utf8', errors='ignore')
a = f_list_id.read()
list_id = eval(a)  # list_id.txt stores the str() of a Python list
f_list_id.close()

print('Loaded list_id; currently ' + str(len(list_id)) + ' entries')
list_path = ['07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '18', '19', '20', '21', '24']
for i in list_path:
    list_id += file_name(path + i)
    # print(path + str(i))

print('Total videos: ' + str(len(list_id)))  # 121117 minus the download failures
# Remove the extraction failures and report what remains

f_extract_fail = open("extract_fail", encoding='utf8', errors='ignore')
i = 0
while 1:
    line_extract_fail = f_extract_fail.readline()
    if not line_extract_fail:
        break
    line_extract_fail = line_extract_fail.replace('\n', '')
    if line_extract_fail in list_id:
        list_id.remove(line_extract_fail)
        i += 1

print('Removed ' + str(i) + ' entries\nFinal total: ' + str(len(list_id)))

f_list_id = open("list_id.txt", 'w', encoding='utf8', errors='ignore')
f_list_id.write(str(list_id))
f_list_id.close()

list_id.clear()
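
A side note, not part of the original pipeline: these scripts persist lists and dicts by writing str(obj) and reading it back with eval(), which executes whatever the file contains. A sketch of a safer equivalent using the standard json module (the list_id.json file name is made up for illustration):

# Sketch: json round-trip instead of str()/eval(); list_id.json is a hypothetical name
import json

with open('list_id.json', 'w', encoding='utf8') as f:
    json.dump(list_id, f)

with open('list_id.json', encoding='utf8') as f:
    list_id = json.load(f)

Similarly, loading extract_fail into a set and rebuilding the list avoids the repeated O(n) list.remove() calls:

# Sketch: set-based filtering of the failed ids
with open('extract_fail', encoding='utf8') as f:
    fails = set(line.strip() for line in f)
list_id = [v for v in list_id if v not in fails]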
Frequency counting

Using list_id and the 121117-line final_hashs.txt, compute hashtag frequencies over the final data and write them to count_hash.txt.
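
Each line of final_hashs.txt follows the id::::text layout that the regexes in count.py rely on. A hypothetical example (the id and tags here are made up):

# Hypothetical final_hashs.txt line, matching the format count.py parses
line = '123456789::::#funny #cat \n'
# get_id(line)   -> ['123456789']
# get_text(line) -> ['#funny #cat ']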

count.py
# Count the total number of hashtags and their frequencies
import collections
import re


def get_id(template):
    rule = r'(.*?)::::'
    slot_list = re.findall(rule, template)
    return slot_list


def get_text(template):
    rule = r'::::(.*?)\n'
    slot_list = re.findall(rule, template)
    return slot_list


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python list or dict
    f.close()
    return dict_or_list


list_id = get_list_or_dict('list_id.txt')
print('list_id length: ' + str(len(list_id)))

f_id_hash_final = open("id_hash_final.txt", 'w', encoding='utf8', errors='ignore')

# Build the id -> hashtag-text dict from final_hashs.txt
f = open("final_hashs.txt", encoding='utf8', errors='ignore')
dict_id_hash = {}
while 1:
    line_txt = f.readline()
    if not line_txt:
        break
    dict_id_hash[get_id(line_txt)[0]] = get_text(line_txt)[0]
f.close()
print('dict_id_hash length: ' + str(len(dict_id_hash)))  # 121117
f_dict_id_hash = open("dict_id_hash.txt", 'w', encoding='utf8', errors='ignore')
f_dict_id_hash.write(str(dict_id_hash))
f_dict_id_hash.close()

# Keep only the ids that survived segmentation
for id in list_id:
    id = id.replace('.mp4', '')
    f_id_hash_final.write(id + '::::' + dict_id_hash[id] + '\n')
f_id_hash_final.close()

f_id_hash_final = open("id_hash_final.txt", encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt", 'w', encoding='utf8', errors='ignore')


def subString1(template):  # a hashtag starts with '#' and ends at a space or newline
    copy = False
    finished = False
    slotList = []
    tag = ""
    for s in template:
        if s == '#':
            copy = True
        elif s == ' ' or s == '\n':
            copy = False
            finished = True
        elif copy:
            tag = tag + s
        if finished:
            if tag != "":
                slotList.append(tag)
            tag = ""
            finished = False
    return slotList


slotList = []
while 1:
    line_id_hash = f_id_hash_final.readline()
    if not line_id_hash:
        break
    slotList += subString1(line_id_hash)  # collect every hashtag into one list

result = collections.Counter(slotList)  # frequency count
# print(result.most_common(100))  # inspect the top 100 tags
for each in result.most_common():
    ss = str(each)  # one "(tag, count)" tuple repr per line
    f_count_hash.write(ss + '\n')
f_id_hash_final.close()
f_count_hash.close()
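
As an aside, subString1 can be approximated with a single regex; a sketch (behavior differs on edge cases such as adjacent '#' characters, so this is not a drop-in replacement for the original):

# Sketch: regex-based hashtag extraction, roughly equivalent to subString1
import re

def extract_hashtags(line):
    # a '#' followed by characters up to the next space or newline
    return re.findall(r'#([^ \n#]+)[ \n]', line)

print(extract_hashtags('123::::#funny #cat \n'))  # prints ['funny', 'cat']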
Truncating at the frequency cutoff

Using the frequency counts, keep the data whose hashtags have a frequency of at least 5 and write it to id_hash.
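
Because Counter.most_common() writes count_hash.txt in descending frequency order, the script below can simply stop at the first line whose count is 4: every tag before that line has frequency of at least 5. Each line is the repr of a (tag, count) tuple, so a simpler parse than the regexes below would be ast.literal_eval; a sketch (assuming every line really is a well-formed tuple repr, which str((tag, n)) produces):

# Sketch: parsing a count_hash.txt line such as "('funny', 12)" without regexes
import ast

tag, freq = ast.literal_eval("('funny', 12)")
print(tag, freq >= 5)  # prints: funny True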

count_num_hash.py
import re


def getText(template):
    rule = r'::::(.*?)\n'
    slotList = re.findall(rule, template)
    return slotList


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python list or dict
    f.close()
    return dict_or_list


f_count_hash = open("count_hash.txt", encoding='utf8', errors='ignore')
list_hash = []
flag_hash_line = 0
while 1:
    line_hash = f_count_hash.readline()
    if not line_hash:
        break
    rule1 = r"'(.*?)'"   # tuple repr with single quotes: ('tag', 12)
    rule2 = r'"(.*?)"'   # repr falls back to double quotes when the tag contains a quote
    rule3 = r' (.*?)\)'  # the count at the end of the tuple
    try:
        slotList = re.findall(rule1, line_hash)
        hash_num = re.findall(rule3, line_hash)
        if int(hash_num[0]) == 4:
            # the file is sorted by descending count, so the first count of 4
            # means every remaining tag has frequency below 5
            break
        list_hash.append(str(slotList[0]))
        flag_hash_line += 1
    except IndexError:
        slotList = re.findall(rule2, line_hash)
        list_hash.append(str(slotList[0]))
        flag_hash_line += 1

print(flag_hash_line)

list_id = get_list_or_dict('list_id.txt')
print(len(list_id))
dict_id_hash = get_list_or_dict('dict_id_hash.txt')
print(len(dict_id_hash))

f_id_hash = open("id_hash", 'w', encoding='utf8', errors='ignore')
i = 0
for id in list_id:
    id = str(id).replace('.mp4', '')
    line_list_hash = dict_id_hash[id].split(' ')
    line_hash_new = ''
    for hash in line_list_hash:
        hash = str(hash).replace('#', '')
        if hash in list_hash:
            line_hash_new += '#' + hash + ' '
    if line_hash_new != '':  # skip entries left with no surviving hashtag
        f_id_hash.write(id + '::::' + line_hash_new + '\n')
        i += 1
        if i % 10000 == 0:
            print(i)
f_id_hash.close()
print(i)

The final pass yields an id_hash file with 118684 entries.

Cleaning the data

Using the id_error file, delete the invalid entries from the set of 119120 segmented files.
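
The script below assumes dict_id_num.txt stores the str() of a dict mapping each video id to the name of its numbered folder entry; a hypothetical shape (the values here are made up):

# Hypothetical shape of the dict stored in dict_id_num.txt
dict_id_num = {'123456789': '000001', '987654321': '000002'}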

clean_extract_data.py
import re
import os
import shutil


def get_id(template):
    rule = r'(.*?)::::'
    slot_list = re.findall(rule, template)
    return slot_list


def get_list_or_dict(path):
    f = open(path, encoding='utf8', errors='ignore')
    a = f.read()
    dict_or_list = eval(a)  # the file stores the str() of a Python dict
    f.close()
    return dict_or_list


dict_id_num = get_list_or_dict('dict_id_num.txt')
print(len(dict_id_num))

# Map each bad id from id_error to its folder entry name
list_id_error = []
f_id_error = open('id_error', encoding='utf8', errors='ignore')
while 1:
    line_id_error = f_id_error.readline()
    if not line_id_error:
        break
    list_id_error.append(dict_id_num[line_id_error.replace('\n', '')])
print(len(list_id_error))

path = '/home/caoda/Hodge_work_space/dataset/dataset'
ii = 0
for i in range(1, 16):
    list_extract_num = os.listdir(path + str(i))
    for num in list_extract_num:
        if num in list_id_error:
            try:
                shutil.rmtree(path + str(i) + '/' + num)
                # print(path + str(i) + '/' + num)
            except OSError:
                print('Deletion failed: ' + path + str(i) + '/' + num)
    ii += len(os.listdir(path + str(i)))  # count what remains in each folder
print(ii)
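
One more hedged aside: "num in list_id_error" scans a Python list on every check, which gets slow across fifteen folders. A set gives constant-time lookups; a sketch of the same deletion loop (reusing list_id_error and path from the script above):

# Sketch: the deletion loop with a set for O(1) membership tests
import os
import shutil

set_id_error = set(list_id_error)  # build once
for i in range(1, 16):
    for num in os.listdir(path + str(i)):
        if num in set_id_error:
            shutil.rmtree(path + str(i) + '/' + num)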

At this point the dataset is fully prepared and segmented.
The dataset and code are published on GitHub.