# Flask app: search a security-report Elasticsearch index, build a TextRank-based
# extractive summary of the matching reports, and highlight the key phrases.
import csv
import warnings
warnings.filterwarnings("ignore")
import time
import re
import spacy
import pytextrank
import nltk
import random
random.seed(0)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem.wordnet import WordNetLemmatizer as WNL
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from operator import itemgetter
from flask import Flask, render_template, request, g
app = Flask(__name__)
from elasticsearch import Elasticsearch
from googletrans import Translator

@app.route('/')
def form():
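    # Serve the query form (templates/form.html).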
    return render_template('form.html')

@app.route('/elasticsearch', methods = ['POST', 'GET'])
def search():
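    # Take the submitted report text, look up its label in Elasticsearch, pull all
    # reports with the same label, and return a TextRank-based extractive summary.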
    t1 = time.time()
    if request.method == 'POST':
        result = str(request.form['raw_text'])
        es = Elasticsearch(host = '153.120.135.103', port = 9200, http_auth = ('elastic', 'gui3DhVRfd9F18n30o34'))
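        # First query: fetch the single best match for the submitted text and read
        # off its 'label' (report category), which drives the second query below.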
        f_res = es.search(index = 'securityreports-2020-1009-1413', body = {
            '_source':['label', 'Title'],
            'size':1,
            'query':{'match':{'Text': result }}})
        for doc in f_res['hits']['hits']:
            dict1 = doc['_source']
            label = dict1['label']
            #print(dict1['Title'])
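        # Second query: fetch every report sharing that label (up to 10,000 hits),
        # including title, text, label, and publisher ('発行元').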
        res = es.search(index = 'securityreports-2020-1009-1413', body = {
            '_source':['Title','Text','label','発行元'],
            'size':10000,
            'query':{'match_phrase':{'label': label }}})
        hit_num = res['hits']['total']['value']
        #print(hit_num)
        #print(label)
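        # Dump the hits to psr.csv (CP932) so the /nonchange route can reuse them
        # without querying Elasticsearch again.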
        with open('psr.csv', 'w', encoding='CP932', errors='ignore', newline='') as f:
            header_present = False
            for doc in res['hits']['hits']:
                my_dict = doc['_source']
                if not header_present:
                    w = csv.DictWriter(f, my_dict.keys())
                    w.writeheader()
                    header_present = True
                w.writerow(my_dict)
                print(my_dict)
        a_list = []
        with open('psr.csv', encoding='CP932', errors='ignore') as csvfile:
            for row in csv.reader(csvfile):
                a_list.append(row[1])
        del a_list[0]
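        # Sample up to 50 report texts (presumably to bound the TextRank runtime per request).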
        random_sents = random.sample(a_list, min(50, len(a_list)))
        reports = ' '.join(random_sents)
        nlp = spacy.load('en_core_web_sm', disable = ['ner'])
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
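        # Note: this is the pre-3.0 pytextrank API. With spaCy >= 3 and pytextrank >= 3,
        # the equivalent registration would be (sketch, not used here):
        #     nlp.add_pipe("textrank", last=True)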
        doc = nlp(reports)
        limit_phrases = 15
        phrase_id = 0
        wnl = WNL()
        unit_vector = []
        phrases = []
        phrase = []
        rank_vector = []
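        # Collect the top `limit_phrases` TextRank phrases: their ranks (unit_vector),
        # surface forms (phrases), and space-joined lemmatized forms (phrase).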
        for p in doc._.phrases:
            unit_vector.append(p.rank)
            phrases.append(p.text)
            word_tokenize_list = word_tokenize(p.text.lower())
            a = ''
            for lemma in word_tokenize_list:
                a += ' '
                a += wnl.lemmatize(lemma)
            phrase.append(a)
            # phrase: space-joined, lemmatized forms of the top-ranked phrases
            phrase_id += 1
            if phrase_id == limit_phrases:
                break
        sum_ranks = sum(unit_vector)
        rank_vector = [ rank/sum_ranks for rank in unit_vector ]
        sent_list = []
        sent_rank = {}
        sent_id = 0
        sum_vec = 0.0
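        # Score each sentence: count occurrences of each lemmatized phrase and weight
        # the counts by that phrase's TextRank score.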
        for sent in doc.sents:
            a = str(sent)
            text_tokenize_list = word_tokenize(a.lower())
            text_box = ''
            pnum = []
            score = 0
            for text in text_tokenize_list:
                text_box += ' '
                text_box += wnl.lemmatize(text)
            for p in phrase:
                pnum.append(text_box.count(p))
                score += (text_box.count(p))*(unit_vector[phrase.index(p)])
            sent_rank[sent_id] = score
            sent_id += 1
            sent_list.append(pnum)
            # sent_list: per-sentence bag-of-words count vector over the top phrases
        sent_id = 0
        sent_text = {}
        sentence = []
        limit_sentences = 10
        for sent in doc.sents:
            sent_text[sent_id] = sent.text
            sent_id += 1
        num_sent = 0
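        # Keep the `limit_sentences` highest-scoring sentences as the extractive summary.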
        for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1), reverse = True):
            sentence.append(sent_text[sent_id])
            num_sent += 1
            if num_sent == limit_sentences:
                break
        with open('sentence.txt', 'w', encoding='CP932', errors='ignore', newline='') as f:
            f.write('\n'.join(sentence))
        htmltext = []
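        # Wrap every occurrence of a top phrase (lemmatized or surface form) in an
        # orange <span> so the template can show why each sentence was selected.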
        for s in sentence:
            text_tokenize_list = word_tokenize(s.lower())
            text_box = ''
            sent_box = ''
            for text in text_tokenize_list:
                text_box += ' '
                text_box += wnl.lemmatize(text)
            sent_box += s + ' '
            for p in phrase:
                senty = re.compile(re.escape(p), re.IGNORECASE)
                sent_box = re.sub(senty, '<span style="font-weight: 600; color:orange;">' + p + '</span>', sent_box)
            for p in phrases:
                senty = re.compile(re.escape(p), re.IGNORECASE)
                sent_box = re.sub(senty, '<span style="font-weight: 600; color:orange;">' + p + '</span>', sent_box)
            htmltext.append(sent_box)
        t2 = time.time()
        print(t2-t1) 
        return render_template("search.html", label = label, sentence = sentence, hits = hit_num, phrase_list = phrase, relascore = rank_vector, html = htmltext)
            
@app.route('/nonchange', methods = ['POST', 'GET'])
def ten_phrases():
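    # Re-rank all reports saved in psr.csv against the summary sentences the user
    # selected on the previous page, then return the top 10 with phrase highlighting.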
    t1 = time.time()
    # t1: start
    if request.method == 'POST':
        translator = Translator()
        a_list = []
        b_list = []
        p_list = []
        with open('psr.csv', encoding='CP932', errors='ignore') as csvfile:
            for row in csv.reader(csvfile):
                a_list.append(row[1])
                b_list.append(row[0])
                p_list.append(row[3])
        del a_list[0]
        del b_list[0]
        del p_list[0]
        t2 = time.time()
        # t2: CSV file loaded
        texts = []
        reports = ' '.join(a_list)
        texts.extend(a_list)
        nlp = spacy.load('en_core_web_sm', disable = ['ner'])
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        t3 = time.time()
        # t3: spaCy pipeline ready
        with open('sentence.txt', encoding='cp932', errors='ignore') as f:
            c_list = [s.strip() for s in f.readlines()]
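        # The form posts the indices of the summary sentences the user kept;
        # rebuild the query text from those sentences only.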
        result = request.form.getlist('sentences')
        result = [int(s) for s in result]
        sentence = []
        for i in result:
            sentence.append(c_list[i])
        raw_text = ' '.join(sentence)
        t4 = time.time()
        # t4: query text assembled
        doc = nlp(raw_text)
        t5 = time.time()
        # t5: doc processed
        phrase_id = 0
        wnl = WNL()
        unit_vector = []
        phrases = []
        phrase = []
        text_rank = {}
        htmltext = []
        title_list = []
        publish_list = []
        entexts = []
        limit_phrases = 15
        t6 = time.time()
        # t6: lists initialized
        for p in doc._.phrases:
            unit_vector.append(p.rank)
            phrases.append(p.text)
            word_tokenize_list = word_tokenize(p.text.lower())
            a = ''
            for lemma in word_tokenize_list:
                a += ' '
                a += wnl.lemmatize(lemma)
            phrase.append(a)
            phrase_id += 1
            if phrase_id == limit_phrases:
                break
        t7 = time.time()
        # t7: top phrases determined
        sum_ranks = sum(unit_vector)
        unit_vector = [ rank/sum_ranks for rank in unit_vector ]
        t772 = 0
        t773 = 0
        t774 = 0
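        # Score every stored report: run it through spaCy, count occurrences of the
        # top phrases per sentence, and average the weighted counts over its sentences.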
        for i in range(len(texts)):
            t71 = time.time()
            doc = nlp(texts[i])
            t72 = time.time()
            t772 += t72-t71
            sent_list = []
            sent_rank = {}
            sent_id = 0
            sum_vec = 0.0
            for sent in doc.sents:
                a = str(sent)
                text_tokenize_list = word_tokenize(a.lower())
                text_box = ''
                pnum = []
                score = 0
                for text in text_tokenize_list:
                    text_box += ' '
                    text_box += wnl.lemmatize(text)
                for p in phrase:
                    pnum.append(text_box.count(p))
                sent_id += 1
                sent_list.append(pnum)
            t73 = time.time()
            t773 += t73-t72
            for s in sent_list:
                score = 0
                for phrase_id in range(len(unit_vector)):
                    score += s[phrase_id]*unit_vector[phrase_id]
                sent_rank[sent_id] = score
                sum_vec += score
                sent_id += 1
            text_rank[i] = sum_vec / len(sent_list) if sent_list else 0.0
            t74 = time.time()
            t774 += t74-t73
        #print('t72-t71:', t772)
        #print('t73-t72:', t773)
        #print('t74-t73:', t774)
        t8 = time.time()
        # t8: similar reports scored
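        # Keep the 10 highest-scoring reports (text, title, publisher).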
        rank_list = sorted(text_rank.items(), key=itemgetter(1), reverse = True)
        for rank, l in enumerate(rank_list):
            entexts.append(a_list[l[0]])
            title_list.append(b_list[l[0]])
            publish_list.append(p_list[l[0]])
            if rank == 9:
                break
        text_list =[]
        t9 = time.time()
        # t9: highlighting preparation
        t992 = 0
        t993 = 0
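        # Build highlighted HTML for each top report: shade each sentence by how many
        # top phrases it contains (0 / 1 / 2 / 3+), then colour the phrases themselves.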
        for i in range(len(entexts)):
            t91 = time.time()
            doc = nlp(entexts[i])   
            t92 = time.time()
            t992 += t92-t91    
            sent_box = ''
            for sent in doc.sents:
                a = str(sent)
                text_tokenize_list = word_tokenize(a.lower())
                text_box = ''
                pnum = []
                score = 0
                for text in text_tokenize_list:
                    text_box += ' '
                    text_box += wnl.lemmatize(text)
                for p in phrase:
                    pnum.append(text_box.count(p))
                sent_id += 1
                sent_list.append(pnum)
                if sum(pnum) == 0:
                    sent_box += str(sent) + ' '
                elif sum(pnum) == 1:
                    sent_box += '<span style="background: linear-gradient(transparent 90%, #F6AD3C 0%);">' + str(sent) + '</span>' + ' '
                elif sum(pnum) == 2:
                    sent_box += '<span style="background: linear-gradient(transparent 60%, #F6AD3C 0%);">' + str(sent) + '</span>' + ' '
                else:
                    sent_box += '<span style="background: linear-gradient(transparent 0%, #F6AD3C 0%);">' + str(sent) + '</span>' + ' '
                for p in phrase:
                    senty = re.compile(re.escape(p), re.IGNORECASE)
                    sent_box = re.sub(senty, '<span style="font-weight: 600; color:red;">' + p + '</span>', sent_box)
                for p in phrases:
                    senty = re.compile(re.escape(p), re.IGNORECASE)
                    sent_box = re.sub(senty, '<span style="font-weight: 600; color:red;">' + p + '</span>', sent_box)
            htmltext.append(sent_box)
            t93 = time.time()
            t993 += t93-t92
        #print('t93-t92:', t993)
        #print('t92-t91:', t992)
        t10 = time.time()
        # t10: highlighting done
        print(t10-t1)
        return render_template("nonchange.html", phrase_list = phrases, text_list = htmltext, title = title_list, publish = publish_list)

if __name__ == "__main__":
    app.run(host='153.120.135.103', port = 80, debug=True)