(Code) Python implementation of phrase extraction from a sentence

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import numpy as np 
import string
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob

import pdb

# Number of index slots written per phrase in the index file; phrases with
# fewer words are padded with '0' entries up to this count.
max_phrase_length = 5

# Dataset root. The main loop below treats every entry in this directory as
# a video name that has a matching '<video>/<video>.txt' sentence file.
basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/test_dataset/'

path = basicPath
files = os.listdir(path)
print(path)

# Vocabulary file, one entry per line. A word's position in wordList is the
# integer written to the phrase-index files.
word_base_path = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/word_list.txt'

# Read inside a context manager so the handle is closed even if reading fails.
with open(word_base_path, 'r') as wordBase:
    lines = wordBase.readlines()

# Drop the trailing newline first, then any trailing '.' characters.
wordList = [line.rstrip('\n').rstrip('.') for line in lines]


# Map each vocabulary word to the position of its FIRST occurrence -- the same
# value wordList.index() returns -- so every lookup in the loop below is a
# constant-time dict access instead of a linear scan of the list.
word_to_index = {}
for vocab_index, vocab_word in enumerate(wordList):
    word_to_index.setdefault(vocab_word, vocab_index)

# For every video directory: read its sentence, extract noun phrases with
# TextBlob, and write two files next to the sentence file:
#   auto_extracted_Phrase.txt       -- one noun phrase per line
#   autoExtracted_Phrase_Index.txt  -- comma-terminated vocabulary indices,
#                                      each phrase zero-padded to
#                                      max_phrase_length entries
for videoName in files:
    print(videoName)

    video_dir = os.path.join(path, videoName)
    # The original note says other datasets keep the sentence in
    # '<video>/language.txt' instead of '<video>/<video>.txt'.
    langPath = os.path.join(video_dir, videoName + '.txt')

    # Only the first line of the sentence file is used.
    with open(langPath, 'r') as f:
        language = f.readline()

    blob = TextBlob(language)
    noun_phrases = blob.noun_phrases
    print(noun_phrases)

    langPath_Phrase = os.path.join(video_dir, 'auto_extracted_Phrase.txt')
    langPath_PhraseIndex = os.path.join(video_dir, 'autoExtracted_Phrase_Index.txt')

    # Context managers guarantee both outputs are flushed and closed, even
    # when an out-of-vocabulary word aborts the run.
    with open(langPath_Phrase, 'w') as f_phrase, \
            open(langPath_PhraseIndex, 'w') as f_phrase_Idx:
        for phrase in noun_phrases:
            f_phrase.write(phrase)
            f_phrase.write('\n')

            # NOTE: len(phrase) is a character count, so this only skips
            # phrases that are at most one character long.
            if len(phrase) > 1:
                word_ = word_tokenize(phrase)

                # Count per phrase (not per video) so that every phrase is
                # padded independently to max_phrase_length slots.
                written_num = 0
                for token in word_:
                    if token not in word_to_index:
                        raise ValueError(
                            '%r is not in the word list (phrase %r, video %r)'
                            % (token, phrase, videoName))
                    f_phrase_Idx.write(str(word_to_index[token]))
                    f_phrase_Idx.write(',')
                    written_num += 1

                # Pad short phrases with '0' entries; phrases that already
                # have max_phrase_length or more words get no padding
                # (multiplying a string by a non-positive count yields '').
                f_phrase_Idx.write('0,' * (max_phrase_length - written_num))

猜你喜欢

转载自www.cnblogs.com/wangxiaocvpr/p/10571212.html