from collections import Counter 
import nltk 
import re 
import pickle 


# VARIABLES 

# Input text to analyse and the report file the results are written to.
# Both handles are opened at module level; `destination` is closed by the
# `with destination` block at the end of the script.
source = open("pos_obama.txt", "r")
destination = open("obamas_trigrams.txt", "w")
# Report header. The previous literal spelled the possessive as "\S", which is
# an invalid escape sequence (warning on current Python, slated to become an
# error) and wrote a stray backslash into the report.
destination.write("OBAMA'S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# source = open("pos_trump.txt", "r") 
# destination = open("trumps_trigrams.txt", "w") 
# destination.write("TRUMPS MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n") 


# FUNCTIONS 
## sort dictionary items by frequency, highest first (uses collections.Counter)
def sort_dict(frequency_d):
        """Return the (item, count) pairs of *frequency_d*, most frequent first.

        The argument is handed to ``collections.Counter`` and the result of
        ``most_common()`` is returned, i.e. a list of 2-tuples ordered by
        descending count.
        """
        return Counter(frequency_d).most_common()

## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS                 

# 1. Create dictionary of trigrams 
# Compiled once instead of once per word. `match` anchors at the start of the
# token, so a token is kept when its FIRST character is an ASCII letter or
# digit; tokens that begin with punctuation are dropped.
cleaning = re.compile(r"[A-Za-z0-9]")

# Maps each trigram (a 3-tuple of tokens) to the number of times it occurs.
trigrams = Counter()

# `with source` closes the input file once every line has been read.
with source:
        for line in source:
                # split() without an argument splits on any whitespace run and
                # discards the trailing newline. split(" ") left "\n" glued to
                # the last token of each line, so the same trigram was counted
                # under a different key whenever it ended a line.
                clean_tri = [word for word in line.split() if cleaning.match(word)]
                # Trigrams are built per line, so none spans a line break.
                trigrams.update(nltk.trigrams(clean_tri))

# 2. Rank the trigrams by frequency and keep the ten most frequent ones
trigrams_sorted = sort_dict(trigrams)
first10pairs = trigrams_sorted[:10]


# 3. Write one "trigram : count" line per entry; leaving the `with` block
#    closes the destination file.
with destination as text:
        text.writelines(
                "{} : {} \n".format(tri, frequency)
                for tri, frequency in first10pairs
        )