vine数据集处理

问题描述

有一个从vine.co爬取下来的数据集,需要对视频描述中的HashTag进行提取与计数

筛选

首先筛选视频描述中拥有HashTag(以#开头的单词或短语)的条目

handle_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# handle_data.py — keep the (description, url) line pairs whose description
# contains at least one hashtag.
#
# Dataset 1 is a 1:1, same-order correspondence between video_text.txt and
# video_download_link.txt, so the two inputs are read in lockstep.
# 'with' guarantees all four files are closed and the output buffers flushed
# (the original never closed the two output files); errors='ignore' skips
# undecodable bytes in the crawled data.
HASH_MARK = '#'  # a hashtag is flagged by a leading '#'
with open("video_text.txt", encoding='utf8', errors='ignore') as f_video_text, \
     open("video_download_link.txt", encoding='utf8', errors='ignore') as f_video_url, \
     open("video_text_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_text_hash, \
     open("video_url_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_url_hash:
    for line_video_text in f_video_text:
        # advance the url file in lockstep with the description file
        line_video_url = f_video_url.readline()
        if HASH_MARK in line_video_text:
            f_video_text_hash.write(line_video_text)
            f_video_url_hash.write(line_video_url)
统计

遍历得到的文件,提取所有的HashTag并做词频统计

handle_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Count every hashtag in the filtered descriptions and write the frequencies.
import collections
# input: description lines that contain at least one '#' (from handle_data.py)
f_video_text_hash = open("video_text_hash.txt", encoding='utf8', errors='ignore')
# output: one "(tag, count)" tuple per line, most frequent first
f_handle_hash = open("handle_hash.txt", 'w', encoding='utf8', errors='ignore')

def subString1(template):
    """Extract every hashtag from one line of text.

    A hashtag starts at '#' and runs until the next space, newline or '#'.
    Fixes over the original implementation:
      * back-to-back tags ("#a#b") are split into two tags instead of being
        fused into "ab" (the old code never flushed on a new '#');
      * a tag at the very end of the string with no trailing separator is no
        longer dropped.
    Returns the list of tag strings (without '#'), in order of appearance.
    """
    slotList = []
    current = ""        # characters of the tag being accumulated
    copying = False     # True while we are inside a '#...' run
    for ch in template:
        if ch == '#':
            if current:            # "#a#b": flush 'a' before starting 'b'
                slotList.append(current)
                current = ""
            copying = True
        elif ch == ' ' or ch == '\n':
            if current:
                slotList.append(current)
                current = ""
            copying = False
        elif copying:
            current += ch
    if current:                    # tag terminated by end-of-string
        slotList.append(current)
    return slotList

# Gather every hashtag from the filtered file, count occurrences, and persist
# the frequency table — one "(tag, count)" tuple per line, most common first.
slotList = []
for line_video_text in f_video_text_hash:
    slotList += subString1(line_video_text)

result = collections.Counter(slotList)
for each in result.most_common():
    f_handle_hash.write(str(each) + '\n')

f_video_text_hash.close()
f_handle_hash.close()
数据集2

以上是处理数据集1:n条video:::text对应n条video:::url,顺序数量一一对应。后来又有了数据集2:n条video:::text对应m条video:::url,而且顺序不对应,两者间也不具备完全的包含关系。

筛选

首先是获得含有hashtag的数据集合video_text_hash.txt

get_video_text_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
# get_video_text_hash.py — keep only the dataset-2 description lines that
# contain a hashtag.
# 'with' guarantees both files are closed (the original never closed the
# output file, risking lost buffered writes).
HASH_MARK = '#'  # a hashtag is flagged by a leading '#'
with open("location_videos_description", encoding='utf8', errors='ignore') as f_video_text, \
     open("video_text_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_text_hash:
    for line_video_text in f_video_text:
        if HASH_MARK in line_video_text:
            f_video_text_hash.write(line_video_text)
去重

然后将这个数据集2的video_text_hash.txt(复制成video_text_hash_2.txt)和数据集1的video_text_hash.txt(复制成video_text_hash_1.txt)比较,去除重复条目得到video_text_hash_3.txt,重复的条目存到video_cover.txt

get_video_text_hash_off_cover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
# De-duplicate the dataset-1 and dataset-2 video_text_hash files: dataset-2
# entries absent from dataset 1 go to video_text_hash_3.txt, duplicates go
# to video_cover.txt.
f1_video_text = open("video_text_hash_1.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_text_hash_2.txt",encoding='utf8', errors='ignore')
f_video_text_hash = open("video_text_hash_3.txt",'w',encoding='utf8', errors='ignore')
f_video_text_cover = open("video_cover.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Build the set of dataset-1 ids, then split dataset 2 into new entries
# (video_text_hash_3.txt) and duplicates (video_cover.txt).

# ids present in dataset 1 — a set gives O(1) membership tests instead of
# the original list's O(n) scan per dataset-2 key
ids_1 = set()
for line_video_text in f1_video_text:
    ids_1.add(getId(line_video_text)[0])

# dataset 2 as {id: text}
dict_2 = {}
for line_video_text in f2_video_text:
    dict_2[getId(line_video_text)[0]] = getText(line_video_text)[0]

progress = 0
for key, text in dict_2.items():
    # write ':::' (the original wrote '::'), so the output matches the
    # id:::text format that get_video_url_hash.py later parses with getId —
    # with '::' the downstream getId(line)[0] would raise IndexError
    if key not in ids_1:
        f_video_text_hash.write(key + ':::' + text + '\n')
    else:
        f_video_text_cover.write(key + ':::' + text + '\n')
    progress += 1
    if progress % 5000 == 0:
        print(progress)  # coarse progress indicator for large files

f1_video_text.close()
f2_video_text.close()
f_video_text_hash.close()
f_video_text_cover.close()  # the original leaked this handle
获得url

然后将这个数据集2的video_text_hash_3.txt(复制成video_text_hash_off_cover.txt)和数据集2的video_url.txt比较,找到缺失url的id条目得到video_url_off.txt,清洗后的id与url条目存到video_url_hash.txt

get_video_url_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
# Clean dataset-2's video_url.txt against video_text_hash_off_cover.txt:
# ids that have a url go to video_url_hash.txt, ids whose url is missing go
# to video_url_off.txt.
f1_video_text = open("video_text_hash_off_cover.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_url.txt",encoding='utf8', errors='ignore')
f_video_url_hash = open("video_url_hash.txt",'w',encoding='utf8', errors='ignore')
f_video_url_off = open("video_url_off.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Split the cleaned dataset-2 description ids by whether a url exists for
# them in video_url.txt.
# (The original wrapped the write() calls in try/except IndexError — write()
# cannot raise IndexError, so those dead handlers were removed.)

# ids of the cleaned dataset-2 descriptions, file order preserved
list_1 = []
for line_video_text in f1_video_text:
    list_1.append(getId(line_video_text)[0])

# dataset-2 url file as {id: url}
dict_2 = {}
progress = 0
for line_video_text in f2_video_text:
    dict_2[getId(line_video_text)[0]] = getText(line_video_text)[0]
    progress += 1
    if progress % 5000 == 0:
        print(progress)  # coarse progress indicator for large files

print("开始清洗")
progress = 0
for vid in list_1:
    if vid in dict_2:
        # NOTE(review): '::::' (four colons) is kept from the original, but
        # it differs from the ':::' separator used everywhere else in this
        # pipeline — confirm the intended format of video_url_hash.txt.
        f_video_url_hash.write(vid + '::::' + dict_2[vid] + '\n')
    else:
        f_video_url_off.write(vid + '\n')
    progress += 1
    if progress % 5000 == 0:
        print(progress)

f1_video_text.close()
f2_video_text.close()
f_video_url_hash.close()
f_video_url_off.close()  # the original leaked this handle
获得text

将video_text_hash_off_cover.txt依据video_url_off.txt清除缺失url的条目得到最终video_text_hash_4.txt

get_video_text_hash_clean_again.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
# Remove from video_text_hash_off_cover.txt the entries whose url is missing
# (ids listed in video_url_off.txt), producing the final video_text_hash_4.txt.
f1_video_text = open("video_url_off.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_text_hash_off_cover.txt",encoding='utf8', errors='ignore')
f_video_text_hash = open("video_text_hash_4.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Drop every dataset-2 entry whose id appears in video_url_off.txt.

# ids whose url is missing — a set makes the per-entry membership test O(1);
# the original scanned a list for every dataset-2 entry (O(n*m) overall)
missing_ids = set()
for line_video_text in f1_video_text:
    # rstrip is robust to a last line without '\n' (the original regex
    # r'(.*?)\n' returned [] there and crashed on [0])
    missing_ids.add(line_video_text.rstrip('\n'))

# dataset 2 as {id: text}; list_2 preserves the input file order for output
dict_2 = {}
list_2 = []
progress = 0
for line_video_text in f2_video_text:
    each_id = getId(line_video_text)[0]
    dict_2[each_id] = getText(line_video_text)[0]
    list_2.append(each_id)
    progress += 1
    if progress % 10000 == 0:
        print(progress)  # coarse progress indicator for large files

progress = 0
for vid in list_2:
    if vid not in missing_ids:
        f_video_text_hash.write(vid + ':::' + dict_2[vid] + '\n')
    progress += 1
    if progress % 10000 == 0:
        print(progress)

f1_video_text.close()
f2_video_text.close()
f_video_text_hash.close()
整合

手动将数据集1得到的video:::text和video:::url文件(video_text_hash.txt,video_url_hash.txt),和数据集2得到的video:::text和video:::url文件(video_text_hash_4.txt,video_url_hash.txt)合并得到最终video_text_hash.txt和video_url_hash.txt。

小结

目前只处理了这两类数据,数据集中还包含了用户信息之类的其他信息,发布在github.