from collections import Counter
import nltk
import re
import pickle
# VARIABLES
# Input corpus and output report. Both handles are left open here because the
# rest of the script reads from `source` and keeps writing to `destination`.
source = open("pos_obama.txt", "r")
destination = open("obamas_trigrams.txt", "w")
# Report header. The literal used to contain "\S", which is not a valid escape
# sequence (it raises a SyntaxWarning on recent Python versions and wrote a
# stray backslash into the report); an apostrophe was intended.
destination.write("OBAMA'S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")
# To analyse the other corpus, use these three lines instead of the ones above:
# source = open("pos_trump.txt", "r")
# destination = open("trumps_trigrams.txt", "w")
# destination.write("TRUMPS MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")
# FUNCTIONS
## sort words by frequency (import module)
def sort_dict(frequency_d):
    """Return the entries of *frequency_d* ordered from most to least frequent.

    The argument is handed to ``collections.Counter``; for a mapping of
    item -> count this keeps the counts as they are.

    Returns:
        A list of ``(item, count)`` tuples sorted by descending count.
        Items with equal counts keep the order in which they were first
        encountered.
    """
    return Counter(frequency_d).most_common()
## NOTE: the token list is rebuilt for every line, so a trigram never spans
## two lines of the input.
# 1. Create dictionary of trigrams
# A token is kept only if it starts with an ASCII letter or digit; this drops
# bare punctuation tokens. Compiled once here instead of once per word.
cleaning = re.compile(r"[A-Za-z0-9]")
# Counter is a dict subclass, so `trigrams` is still usable as a plain
# trigram -> frequency mapping.
trigrams = Counter()
with source:  # close the input file as soon as every line has been read
    for line in source:
        # remove punctuation
        # split() without an argument also discards the trailing newline, so
        # the last word of a line is no longer counted as a different token
        # ("word\n"), and runs of spaces do not produce empty tokens.
        clean_tri = [word for word in line.split() if cleaning.match(word)]
        # find trigrams and count frequency of each trigram in the dictionary
        trigrams.update(nltk.trigrams(clean_tri))

# 2. Write the ten most frequent trigrams to the report
trigrams_sorted = sort_dict(trigrams)
first10pairs = trigrams_sorted[:10]
with destination as text:  # closes the report file when done
    for tri, frequency in first10pairs:
        text.write("{} : {} \n".format(tri, frequency))