vine数据集处理

问题描述

有一个从vine.co爬取下来的数据集,需要对视频描述中的HashTag进行提取与计数

筛选

首先筛选视频描述中拥有HashTag(以#开头的单词或短语)的条目

handle_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# handle_data.py — keep the (description, url) line pairs whose description
# contains at least one hashtag.
#
# Dataset 1 is a 1:1, same-order correspondence between video_text.txt and
# video_download_link.txt, so the two inputs are read in lockstep.
# 'with' guarantees all four files are closed and the output buffers flushed
# (the original never closed the two output files); errors='ignore' skips
# undecodable bytes in the crawled data.
HASH_MARK = '#'  # a hashtag is flagged by a leading '#'
with open("video_text.txt", encoding='utf8', errors='ignore') as f_video_text, \
     open("video_download_link.txt", encoding='utf8', errors='ignore') as f_video_url, \
     open("video_text_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_text_hash, \
     open("video_url_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_url_hash:
    for line_video_text in f_video_text:
        # advance the url file in lockstep with the description file
        line_video_url = f_video_url.readline()
        if HASH_MARK in line_video_text:
            f_video_text_hash.write(line_video_text)
            f_video_url_hash.write(line_video_url)
统计

遍历得到的文件,提取所有的HashTag并做词频统计

handle_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Count every hashtag in the filtered descriptions and write the frequencies.
import collections
# input: description lines that contain at least one '#' (from handle_data.py)
f_video_text_hash = open("video_text_hash.txt", encoding='utf8', errors='ignore')
# output: one "(tag, count)" tuple per line, most frequent first
f_handle_hash = open("handle_hash.txt", 'w', encoding='utf8', errors='ignore')

def subString1(template):
    """Extract every hashtag from one line of text.

    A hashtag starts at '#' and runs until the next space, newline or '#'.
    Fixes over the original implementation:
      * back-to-back tags ("#a#b") are split into two tags instead of being
        fused into "ab" (the old code never flushed on a new '#');
      * a tag at the very end of the string with no trailing separator is no
        longer dropped.
    Returns the list of tag strings (without '#'), in order of appearance.
    """
    slotList = []
    current = ""        # characters of the tag being accumulated
    copying = False     # True while we are inside a '#...' run
    for ch in template:
        if ch == '#':
            if current:            # "#a#b": flush 'a' before starting 'b'
                slotList.append(current)
                current = ""
            copying = True
        elif ch == ' ' or ch == '\n':
            if current:
                slotList.append(current)
                current = ""
            copying = False
        elif copying:
            current += ch
    if current:                    # tag terminated by end-of-string
        slotList.append(current)
    return slotList

# Gather every hashtag from the filtered file, count occurrences, and persist
# the frequency table — one "(tag, count)" tuple per line, most common first.
slotList = []
for line_video_text in f_video_text_hash:
    slotList += subString1(line_video_text)

result = collections.Counter(slotList)
for each in result.most_common():
    f_handle_hash.write(str(each) + '\n')

f_video_text_hash.close()
f_handle_hash.close()
数据集2

以上是处理数据集1:n条video:::text对应n条video:::url,顺序数量一一对应。后来又有了数据集2:n条video:::text对应m条video:::url,而且顺序不对应,两者间也不具备完全的包含关系。

筛选

首先是获得含有hashtag的数据集合video_text_hash.txt

get_video_text_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
# get_video_text_hash.py — keep only the dataset-2 description lines that
# contain a hashtag.
# 'with' guarantees both files are closed (the original never closed the
# output file, risking lost buffered writes).
HASH_MARK = '#'  # a hashtag is flagged by a leading '#'
with open("location_videos_description", encoding='utf8', errors='ignore') as f_video_text, \
     open("video_text_hash.txt", 'w', encoding='utf8', errors='ignore') as f_video_text_hash:
    for line_video_text in f_video_text:
        if HASH_MARK in line_video_text:
            f_video_text_hash.write(line_video_text)
去重

然后将这个数据集2的video_text_hash.txt(复制成video_text_hash_2.txt)和数据集1的video_text_hash.txt(复制成video_text_hash_1.txt)比较,去除重复条目得到video_text_hash_3.txt,重复的条目存到video_cover.txt

get_video_text_hash_off_cover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
# De-duplicate the dataset-1 and dataset-2 video_text_hash files: dataset-2
# entries absent from dataset 1 go to video_text_hash_3.txt, duplicates go
# to video_cover.txt.
f1_video_text = open("video_text_hash_1.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_text_hash_2.txt",encoding='utf8', errors='ignore')
f_video_text_hash = open("video_text_hash_3.txt",'w',encoding='utf8', errors='ignore')
f_video_text_cover = open("video_cover.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Build the set of dataset-1 ids, then split dataset 2 into new entries
# (video_text_hash_3.txt) and duplicates (video_cover.txt).

# ids present in dataset 1 — a set gives O(1) membership tests instead of
# the original list's O(n) scan per dataset-2 key
ids_1 = set()
for line_video_text in f1_video_text:
    ids_1.add(getId(line_video_text)[0])

# dataset 2 as {id: text}
dict_2 = {}
for line_video_text in f2_video_text:
    dict_2[getId(line_video_text)[0]] = getText(line_video_text)[0]

progress = 0
for key, text in dict_2.items():
    # write ':::' (the original wrote '::'), so the output matches the
    # id:::text format that get_video_url_hash.py later parses with getId —
    # with '::' the downstream getId(line)[0] would raise IndexError
    if key not in ids_1:
        f_video_text_hash.write(key + ':::' + text + '\n')
    else:
        f_video_text_cover.write(key + ':::' + text + '\n')
    progress += 1
    if progress % 5000 == 0:
        print(progress)  # coarse progress indicator for large files

f1_video_text.close()
f2_video_text.close()
f_video_text_hash.close()
f_video_text_cover.close()  # the original leaked this handle
获得url

然后将这个数据集2的video_text_hash_3.txt(复制成video_text_hash_off_cover.txt)和数据集2的video_url.txt比较,找到缺失url的id条目得到video_url_off.txt,清洗后的id与url条目存到video_url_hash.txt

get_video_url_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
# Clean dataset-2's video_url.txt against video_text_hash_off_cover.txt:
# ids that have a url go to video_url_hash.txt, ids whose url is missing go
# to video_url_off.txt.
f1_video_text = open("video_text_hash_off_cover.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_url.txt",encoding='utf8', errors='ignore')
f_video_url_hash = open("video_url_hash.txt",'w',encoding='utf8', errors='ignore')
f_video_url_off = open("video_url_off.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Split the cleaned dataset-2 description ids by whether a url exists for
# them in video_url.txt.
# (The original wrapped the write() calls in try/except IndexError — write()
# cannot raise IndexError, so those dead handlers were removed.)

# ids of the cleaned dataset-2 descriptions, file order preserved
list_1 = []
for line_video_text in f1_video_text:
    list_1.append(getId(line_video_text)[0])

# dataset-2 url file as {id: url}
dict_2 = {}
progress = 0
for line_video_text in f2_video_text:
    dict_2[getId(line_video_text)[0]] = getText(line_video_text)[0]
    progress += 1
    if progress % 5000 == 0:
        print(progress)  # coarse progress indicator for large files

print("开始清洗")
progress = 0
for vid in list_1:
    if vid in dict_2:
        # NOTE(review): '::::' (four colons) is kept from the original, but
        # it differs from the ':::' separator used everywhere else in this
        # pipeline — confirm the intended format of video_url_hash.txt.
        f_video_url_hash.write(vid + '::::' + dict_2[vid] + '\n')
    else:
        f_video_url_off.write(vid + '\n')
    progress += 1
    if progress % 5000 == 0:
        print(progress)

f1_video_text.close()
f2_video_text.close()
f_video_url_hash.close()
f_video_url_off.close()  # the original leaked this handle
获得text

将video_text_hash_off_cover.txt依据video_url_off.txt清除缺失url的条目得到最终video_text_hash_4.txt

get_video_text_hash_clean_again.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
# Remove from video_text_hash_off_cover.txt the entries whose url is missing
# (ids listed in video_url_off.txt), producing the final video_text_hash_4.txt.
f1_video_text = open("video_url_off.txt",encoding='utf8', errors='ignore')
f2_video_text = open("video_text_hash_off_cover.txt",encoding='utf8', errors='ignore')
f_video_text_hash = open("video_text_hash_4.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return the text after each ':::' delimiter in *template*.

    The original pattern r':::(.*?)\\n' required a trailing newline, so the
    last line of a file without one yielded [] and crashed callers doing
    getText(line)[0]. '(.*)' stops at the newline anyway ('.' never matches
    '\\n') and also handles a missing final newline.
    """
    rule = r':::(.*)'
    slotList = re.findall(rule, template)
    return slotList

def getId(template):
    """Collect every segment of *template* that precedes a ':::' delimiter.

    The lazy match means "a:::b:::" yields ["a", "b"]; callers use index 0
    (the id at the start of the line).
    """
    return re.findall(r'(.*?):::', template)

# Drop every dataset-2 entry whose id appears in video_url_off.txt.

# ids whose url is missing — a set makes the per-entry membership test O(1);
# the original scanned a list for every dataset-2 entry (O(n*m) overall)
missing_ids = set()
for line_video_text in f1_video_text:
    # rstrip is robust to a last line without '\n' (the original regex
    # r'(.*?)\n' returned [] there and crashed on [0])
    missing_ids.add(line_video_text.rstrip('\n'))

# dataset 2 as {id: text}; list_2 preserves the input file order for output
dict_2 = {}
list_2 = []
progress = 0
for line_video_text in f2_video_text:
    each_id = getId(line_video_text)[0]
    dict_2[each_id] = getText(line_video_text)[0]
    list_2.append(each_id)
    progress += 1
    if progress % 10000 == 0:
        print(progress)  # coarse progress indicator for large files

progress = 0
for vid in list_2:
    if vid not in missing_ids:
        f_video_text_hash.write(vid + ':::' + dict_2[vid] + '\n')
    progress += 1
    if progress % 10000 == 0:
        print(progress)

f1_video_text.close()
f2_video_text.close()
f_video_text_hash.close()
整合

手动将数据集1得到的video:::text和video:::url文件(video_text_hash.txt,video_url_hash.txt),和数据集2得到的video:::text和video:::url文件(video_text_hash_4.txt,video_url_hash.txt)合并得到最终video_text_hash.txt和video_url_hash.txt。

小结

目前只处理了这两类数据,数据集中还包含了用户信息之类的其他信息,发布在github.