Python - How to set a unigram's TF-IDF to 0 when the TF-IDF of a related bigram is nonzero?
Python - How to set a unigram's TF-IDF to 0 when the TF-IDF of a related bigram is nonzero?
I am doing sentiment analysis of film reviews using Python, scikit-learn and NLTK. I want to set the feature values of a unigram to 0 (when it has the opposite polarity) whenever a bigram/trigram containing that unigram is nonzero.
for example:
movie is not bad
then the feature vector is ['movie' 'is' 'not' 'bad' 'movie is' 'is not' 'not bad'] = [3 3 1 1 4 2 4]
but I want [3 3 0 0 4 2 4] instead.
code:
"""Sentiment-weighted term features fed to a linear SVM.

Every vocabulary word gets one weight per sentiment class, derived from the
fraction of that class's phrases that contain the word.  A phrase's feature
for a word is the word's in-phrase count multiplied by those weights.  The
resulting matrices train a ``LinearSVC`` whose predictions for the test
phrases are written to ``output_deltatfidf.csv``.
"""
import math
from collections import Counter

# Column order shared by every ``word_sentiment`` row and ``class_sizes``.
EX_NEG, NEG, NEU, POS, EX_POS = range(5)

# (class column, weight) pairs, listed in the order the per-class terms are
# summed into a feature value.
_CLASS_WEIGHTS = (
    (EX_POS, .35),
    (POS, .15),
    (NEU, 1),
    (EX_NEG, -.35),
    (NEG, -.15),
)


def read_lines(path):
    """Return every line of the text file at *path*, newlines included."""
    with open(path, 'r') as handle:
        return handle.readlines()


def read_labelled_phrases(path):
    """Read a tab-separated file and return ``(first_column, second_column)``.

    Blank lines are skipped so that a trailing empty line does not abort the
    run.  A non-blank line without a tab still raises ``IndexError``.
    """
    first_column = []
    second_column = []
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.rstrip('\n').split("\t")
            first_column.append(fields[0])
            second_column.append(fields[1])
    return first_column, second_column


def document_frequency(phrases):
    """Count, for each token, how many of *phrases* contain it at least once.

    Tokens are obtained by stripping trailing newlines and splitting on a
    single space, exactly as the feature extraction does.
    """
    frequency = Counter()
    for phrase in phrases:
        # set(): a phrase contributes at most once per distinct token.
        frequency.update(set(phrase.rstrip('\n').split(" ")))
    return frequency


def build_word_sentiment(unique_words, class_phrases):
    """Build the word-by-class document-frequency matrix.

    *class_phrases* holds five phrase lists in column order
    (EX_NEG, NEG, NEU, POS, EX_POS).  Row ``i`` of the result holds, per
    class, the number of phrases of that class containing ``unique_words[i]``.
    """
    frequencies = [document_frequency(phrases) for phrases in class_phrases]
    return [[freq[word] for freq in frequencies] for word in unique_words]


def word_weights(counts, class_sizes):
    """Return the five per-class weights for one word.

    A class in which the word never occurs contributes ``0``; otherwise the
    contribution is ``weight * log(count / class_size)``.
    """
    return tuple(
        weight * math.log(counts[column] / class_sizes[column])
        if counts[column] != 0 else 0
        for column, weight in _CLASS_WEIGHTS
    )


def build_feature_matrix(phrases, unique_words, word_sentiment, class_sizes):
    """Return one feature row per phrase and one column per vocabulary word.

    Each entry is the sum over classes of ``occurrences * class_weight``,
    where ``occurrences`` is how often the word appears in the phrase when
    the phrase is split on single spaces.
    """
    # The weights depend only on the word, so compute them once up front.
    weights = [word_weights(row, class_sizes) for row in word_sentiment]
    matrix = []
    for phrase in phrases:
        occurrences = Counter(phrase.split(" "))
        matrix.append([
            sum(occurrences[word] * weight for weight in per_class)
            for word, per_class in zip(unique_words, weights)
        ])
    return matrix


def main():
    """Train on ``sentences.txt`` and label ``sentences_test.txt``."""
    # Imported here so the pure-Python helpers above remain importable on a
    # machine without scikit-learn installed.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.svm import LinearSVC

    # ---- reading training review phrases and sentiments ----
    train_list, train_sentiment = read_labelled_phrases('sentences.txt')

    # ---- phrases of each class, in column order ----
    class_phrases = [None] * 5
    class_phrases[EX_NEG] = read_lines('ex_neg.txt')
    class_phrases[NEG] = read_lines('neg.txt')
    class_phrases[NEU] = read_lines('neu.txt')
    class_phrases[POS] = read_lines('pos.txt')
    class_phrases[EX_POS] = read_lines('ex_pos.txt')
    class_sizes = [len(phrases) for phrases in class_phrases]
    print(",".join(str(size) for size in class_sizes))

    # ---- getting unique words ----
    # The vectorizer is only used to derive the vocabulary.  ``input`` keeps
    # its default ('content'): it names the kind of input, not the data.
    model = TfidfVectorizer()
    model.fit(train_list)
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement and fall back for old installations.
    feature_names = getattr(model, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = model.get_feature_names
    unique_words = list(feature_names())

    print("##### word sentiment matrix ####")
    word_sentiment = build_word_sentiment(unique_words, class_phrases)

    print("###the training feature matrix###")
    feature_matrix = build_feature_matrix(
        train_list, unique_words, word_sentiment, class_sizes)
    print(len(feature_matrix))
    # Row width equals the vocabulary size; printed this way so an empty
    # training set does not raise IndexError.
    print(len(unique_words))

    print("###the test feature matrix###")
    test_list, test_phraseid = read_labelled_phrases('sentences_test.txt')
    test_tfidf = build_feature_matrix(
        test_list, unique_words, word_sentiment, class_sizes)

    print("###the linear svc ###")
    classifier = LinearSVC()
    classifier.fit(feature_matrix, train_sentiment)
    test_sentiment = classifier.predict(test_tfidf)

    with open('output_deltatfidf.csv', 'w') as fil:
        fil.write("phraseid,sentiment\n")
        for phrase_id, sentiment in zip(test_phraseid, test_sentiment):
            fil.write(phrase_id + "," + sentiment + "\n")


if __name__ == "__main__":
    main()

# Tags: python sentiment-analysis n-gram
Comments
Post a Comment