# Welcome to Etherpad!
#
# This pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!
#
# Get involved with Etherpad at http://etherpad.org

from collections import Counter
import nltk 
import re 
import pickle 


# VARIABLES 

# Input text, read line by line by the trigram loop further down
# (presumably POS-tagged, going by the file name -- confirm).
source = open("pos_obama.txt", "r")
# Report file; it is closed by the `with destination` block that writes the
# results at the end of the script.
destination = open("obamas_trigrams.txt", "w")
# Header of the report. The previous literal spelled the possessive as "\S",
# an invalid escape sequence that ended up as a literal backslash in the file.
destination.write("OBAMA'S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# source = open("pos_trump.txt", "r") 
# destination = open("trumps_trigrams.txt", "w") 
# destination.write("TRUMPS MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n") 


# FUNCTIONS 
## sort words by frequency (import module) 
def sort_dict(frequency_d):
        """Return the (key, count) pairs of frequency_d, most frequent first.

        frequency_d is passed to collections.Counter, and the value returned
        is the complete list produced by Counter.most_common().
        """
        return Counter(frequency_d).most_common()

## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS                 

# 1. Create dictionary of trigrams 
# Maps each trigram (a 3-tuple of tokens) to the number of times it occurs.
trigrams = Counter()
# Compiled once, outside the loops. match() anchors at the start of the token,
# so only tokens beginning with an ASCII letter or digit are kept; tokens that
# start with punctuation are dropped.
cleaning = re.compile(r"[A-Za-z0-9]")
# Leaving the with-block closes the input file once every line has been read.
with source:
        for line in source:
                # split() with no argument splits on any whitespace and so also
                # discards the trailing newline; split(" ") left it glued to
                # the last token, which made line-final trigrams count under
                # different keys than the same trigram elsewhere.
                clean_tri = [word for word in line.split() if cleaning.match(word)]
                # Each line is windowed on its own, so no trigram spans two
                # lines. Counter.update adds one per trigram yielded.
                trigrams.update(nltk.trigrams(clean_tri))

# 2. Rank every trigram by frequency and keep the ten most common
trigrams_sorted = sort_dict(trigrams)
first10pairs = trigrams_sorted[:10]

# 3. Write one "trigram : count" line per entry; leaving the with-block
#    closes the output file
with destination as text:
        text.writelines(
                "{} : {} \n".format(tri, frequency)
                for tri, frequency in first10pairs
        )