from collections import Counter
import nltk
import re
import pickle

# VARIABLES

# Input text, read line by line when the trigrams are counted below.
source = open("pos_obama.txt", "r")
# Output report; it is closed by the `with destination` block at the end of the script.
destination = open("obamas_trigrams.txt", "w")
# Report header. The apostrophe replaces a stray backslash: "\S" is not a valid
# string escape, so the old literal wrote a backslash into the report and
# triggers an invalid-escape warning on recent Python versions.
destination.write("OBAMA'S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# source = open("pos_trump.txt", "r")
# destination = open("trumps_trigrams.txt", "w")
# destination.write("TRUMPS MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# FUNCTIONS
## sort dictionary entries by frequency, most frequent first (uses collections.Counter)
def sort_dict(frequency_d):
        """Return the entries of ``frequency_d`` ordered from most to least frequent.

        ``frequency_d`` is handed to ``collections.Counter``; the result is the
        list of ``(key, count)`` tuples produced by ``Counter.most_common()``.
        """
        return Counter(frequency_d).most_common()

## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS
# 1. Create dictionary of trigrams
# Counter is a dict subclass, so `trigrams` still maps trigram -> frequency.
trigrams = Counter()
# A token is kept only if its first character is an ASCII letter or digit;
# this is how punctuation-only tokens are dropped. Compiled once, not per word.
cleaning = re.compile(r"[A-Za-z0-9]")
# `with` closes the input file once every line has been read.
with source:
        for line in source:
                # split() without an argument splits on any whitespace, so the
                # trailing newline no longer sticks to the last token of the line
                # (which used to make end-of-line trigrams count separately).
                clean_tri = [word for word in line.split() if cleaning.match(word)]
                # Trigrams are built per line, so none spans a line break.
                # Counter.update adds one to the count of each trigram yielded.
                trigrams.update(nltk.trigrams(clean_tri))

# 2. Rank every trigram by frequency, most frequent first
trigrams_sorted = sort_dict(trigrams)
# keep only the ten leading (trigram, frequency) pairs
first10pairs = trigrams_sorted[:10]

# 3. Write one "trigram : frequency" line per pair; leaving the `with` block
# closes the report file
with destination as text:
        text.writelines(
                "{} : {} \n".format(tri, frequency)
                for tri, frequency in first10pairs
        )