python - how to equate unigram tfidf to 0 when bigram of related unigram is nonzero? -

I am doing sentiment analysis of film reviews using Python, scikit-learn, and NLTK. I want to set the feature values of a unigram to 0 (when it has the opposite polarity) whenever the bigram/trigram that contains that unigram is nonzero.

for example:

movie is not bad

then the feature vector is ['movie' 'is' 'not' 'bad' 'movie is' 'is not' 'not bad'] = [3 3 1 1 4 2 4]

but I want [3 3 0 0 4 2 4] instead.

code:

# Sentiment classification of review phrases with class-weighted
# ("delta TF-IDF"-style) unigram features and a linear SVM.
#
# Inputs (all read from the current directory):
#   sentences.txt       training data, one "<phrase>\t<sentiment>" per line
#   ex_neg.txt, neg.txt, neu.txt, pos.txt, ex_pos.txt
#                       one phrase per line for each sentiment class
#   sentences_test.txt  test data, one "<phrase>\t<phrase id>" per line
# Output:
#   output_deltatfidf.csv  one "<phrase id>,<predicted sentiment>" per line
import math
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


def read_two_columns(path):
    """Return two lists: the first and second tab-separated field of each line."""
    first_column, second_column = [], []
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip('\n').split("\t")
            first_column.append(fields[0])
            second_column.append(fields[1])
    return first_column, second_column


def read_phrases(path):
    """Return every line of *path*, trailing newlines included."""
    with open(path, 'r') as handle:
        return handle.readlines()


def document_frequency(phrases):
    """Map each space-separated token to the number of phrases containing it."""
    frequency = Counter()
    for phrase in phrases:
        # set(): a token repeated inside one phrase still counts once.
        frequency.update(set(phrase.rstrip('\n').split(" ")))
    return frequency


def class_weights(counts, class_sizes):
    """Return the (ep, p, nu, en, n) weights of one word.

    *counts* and *class_sizes* are both ordered
    [ex_neg, neg, neu, pos, ex_pos]. A class in which the word never
    occurs contributes a weight of 0, which also avoids log(0).
    """
    ep = p = nu = en = n = 0
    if counts[4] != 0:
        ep = .35 * math.log(counts[4] / class_sizes[4])
    if counts[3] != 0:
        p = .15 * math.log(counts[3] / class_sizes[3])
    if counts[2] != 0:
        nu = 1 * math.log(counts[2] / class_sizes[2])
    if counts[0] != 0:
        en = -.35 * math.log(counts[0] / class_sizes[0])
    if counts[1] != 0:
        n = -.15 * math.log(counts[1] / class_sizes[1])
    return ep, p, nu, en, n


def build_feature_matrix(phrases, vocabulary, weights):
    """Return a dense len(phrases) x len(vocabulary) matrix of weighted counts.

    Cell [i][j] is the number of times vocabulary[j] occurs in phrases[i]
    (tokens split on single spaces, compared exactly) multiplied by each of
    that word's five class weights and summed. Words absent from a phrase
    keep the initial 0.
    """
    column_of = {word: column for column, word in enumerate(vocabulary)}
    matrix = [[0] * len(vocabulary) for _ in phrases]
    for row, phrase in zip(matrix, phrases):
        for token, occurrences in Counter(phrase.split(" ")).items():
            column = column_of.get(token)
            if column is None:
                continue
            ep, p, nu, en, n = weights[column]
            row[column] = ((occurrences * ep) + (occurrences * p)
                           + (occurrences * nu) + (occurrences * en)
                           + (occurrences * n))
    return matrix


# ---------------- reading training review phrases and sentiments -----------

train_list, train_sentiment = read_two_columns('sentences.txt')

# ---------------- number of phrases in each class --------------------------

ex_pos_phrases = read_phrases('ex_pos.txt')
ex_pos = len(ex_pos_phrases)
pos_phrases = read_phrases('pos.txt')
pos = len(pos_phrases)
neu_phrases = read_phrases('neu.txt')
neu = len(neu_phrases)
neg_phrases = read_phrases('neg.txt')
neg = len(neg_phrases)
ex_neg_phrases = read_phrases('ex_neg.txt')
ex_neg = len(ex_neg_phrases)

print(str(ex_neg) + "," + str(neg) + "," + str(neu) + "," + str(pos) + "," + str(ex_pos))

# ---------------- getting unique words -------------------------------------

# The vectorizer is used only to obtain the vocabulary. The corpus goes to
# fit_transform(); `input` is an option string ('content' by default), not
# the data, so it is no longer passed the training list.
model = TfidfVectorizer()
train_tfidf = model.fit_transform(train_list)
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on old releases.
if hasattr(model, "get_feature_names_out"):
    unique_words = list(model.get_feature_names_out())
else:
    unique_words = model.get_feature_names()

print("##### word sentiment matrix ####")
# ---------------- word sentiment matrix ------------------------------------

# word_sentiment[i] = number of phrases of each class that contain
# unique_words[i], in the column order [ex_neg, neg, neu, pos, ex_pos].
class_frequencies = [
    document_frequency(class_phrases)
    for class_phrases in (ex_neg_phrases, neg_phrases, neu_phrases,
                          pos_phrases, ex_pos_phrases)
]
word_sentiment = [
    [frequency[word] for frequency in class_frequencies]
    for word in unique_words
]

class_sizes = [ex_neg, neg, neu, pos, ex_pos]
word_weights = [class_weights(counts, class_sizes) for counts in word_sentiment]

print("###the training feature matrix###")
# ---------------- the feature matrix ---------------------------------------

feature_matrix = build_feature_matrix(train_list, unique_words, word_weights)
print(len(feature_matrix))
print(len(feature_matrix[0]))

print("###the test feature matrix###")

test_list, test_phraseid = read_two_columns('sentences_test.txt')
test_tfidf = build_feature_matrix(test_list, unique_words, word_weights)

print("###the linear svc ###")

classifier = LinearSVC()
classifier.fit(feature_matrix, train_sentiment)
test_sentiment = classifier.predict(test_tfidf)

with open('output_deltatfidf.csv', 'w') as fil:
    fil.write("phraseid,sentiment\n")
    for phrase_id, sentiment in zip(test_phraseid, test_sentiment):
        fil.write(phrase_id + "," + sentiment + "\n")

python sentiment-analysis n-gram

Search This Blog

Jaimee

python - how to equate unigram tfidf to 0 when bigram of related unigram is nonzero? -

Comments

Post a Comment

Popular posts from this blog

javascript - THREE.js reposition vertices for RingGeometry -

javascript - I need to update the text of a paragraph by inline edit -

assembly - What is the addressing mode for ld, add, and rjmp instructions? -