#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
import csv
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import time
import re
import spacy
import pytextrank
import nltk
import random
random.seed(0)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem.wordnet import WordNetLemmatizer as WNL
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from operator import itemgetter

# Load the small English spaCy model; NER is disabled because only
# sentence segmentation (plus the TextRank pipe added below) is used here.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])
# Attach pytextrank as the last pipeline component.
# NOTE(review): this is the pre-3.x pytextrank API (TextRank() object +
# .PipelineComponent); newer pytextrank registers via nlp.add_pipe("textrank").
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
# WordNet lemmatizer instance (WNL imported above); unused in this chunk —
# presumably used elsewhere or left over.
wnl = WNL()
# Raise the csv field-size limit so very long text cells don't raise
# _csv.Error("field larger than field limit").
csv.field_size_limit(1000000000)

# Per-column accumulators, filled while reading the two CSV files below.
# (dic1 and d_list are initialized but not used in this chunk.)
a_list, b_list, c_list = [], [], []
bow, p_list, l_list = [], [], []
dic1, title, d_list = [], [], []

# Read the labelled-data CSV and the bag-of-words vocabulary CSV.
# Fix: the original opened both files without ever closing them (resource
# leak); `with` guarantees both handles are closed even on error.
# errors='ignore' silently drops undecodable bytes in the label file.
with open('ラベルデータ-201114-1230.csv', encoding='UTF-8', errors='ignore') as cs:
    for row in csv.reader(cs):
        c_list.append(row[1])
        p_list.append(row[2])
        l_list.append(row[5])
        a_list.append(row[3])
        title.append(row[4])

with open("BOW.csv") as f:
    for row in csv.reader(f):
        bow.append(row[0])

# Drop the header row from each labelled-data column.
# NOTE(review): bow keeps its first row — presumably BOW.csv has no header;
# confirm against the file.
del a_list[0]
del c_list[0]
del p_list[0]
del l_list[0]
del title[0]

# Remove each record's title from its body text.
# Fix: the original used a_list.index(i), which returns the index of the
# FIRST occurrence of the value — when two rows have identical text, every
# duplicate was paired with the first row's title (and each lookup is O(n)).
# enumerate pairs each row with its own title correctly in O(n) total.
for n, text in enumerate(a_list):
    a_list[n] = text.replace(title[n], '')

# Sentence-split each body text with spaCy, joining the sentences with a
# '/#' delimiter. One delimiter is appended after every sentence, so the
# string ends with a trailing '/#' (and is '' for texts with no sentences).
for text in a_list:
    parsed = nlp(text)
    b_list.append(''.join(sent.text + '/#' for sent in parsed.sents))


# Stack the accumulated columns row-wise, transpose so each source list
# becomes a DataFrame column, and write the result as UTF-8 CSV.
# (Lists of unequal length are padded with NaN by pandas.)
columns = [c_list, p_list, title, a_list, b_list, bow, l_list]
df = pd.DataFrame(columns).T
df.to_csv('BoW付きデータ-201203-1700.csv', encoding='UTF-8')