# frequency_pos.py
#
# Welcome to Etherpad!
#
# This pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!
#
# Get involved with Etherpad at http://etherpad.org

from collections import Counter
import nltk
import re
import pickle

# VARIABLES

# Input text, consumed line by line by the trigram loop further down
# (presumably POS-tagged, going by the file name -- confirm).
source = open("pos_obama.txt", "r")
# Report file; the `with destination` block at the end of the script closes it.
destination = open("obamas_trigrams.txt", "w")
# Report header. The apostrophe used to be typed as "\S", which is an invalid
# escape sequence and was written to the file as a literal backslash + "S".
destination.write("OBAMA'S MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# source = open("pos_trump.txt", "r")
# destination = open("trumps_trigrams.txt", "w")
# destination.write("TRUMPS MOST FREQUENT TRIGRAMS with Penn's TREEBANK\n\n\n")

# FUNCTIONS
## sort words by frequency (import module)
def sort_dict(frequency_d):
    """Return the ``(key, count)`` pairs of *frequency_d*, highest count first.

    The argument is handed to ``collections.Counter`` and the result of
    ``most_common()`` (called without a limit, so every entry is included)
    is returned as a list of 2-tuples.
    """
    return Counter(frequency_d).most_common()

## MAKE SURE ALL VARIABLES ARE DECLARED WITHIN THE LOOPS

# 1. Create dictionary of trigrams
# Maps each trigram (a 3-tuple of consecutive kept tokens) to the number of
# times it occurs. Trigrams are built per line, so they never span two lines.
trigrams = {}
# A token is kept only if its first character is an ASCII letter or digit,
# which drops tokens that start with punctuation. Compiled once, up front,
# instead of once per word.
cleaning = re.compile(r"[A-Za-z0-9]")
# `with` closes the input file as soon as every line has been read.
with source:
    for line in source:
        # split() without an argument splits on any run of whitespace and
        # discards the trailing newline. split(" ") left "\n" glued to the last
        # token of each line, so line-final trigrams were counted under their
        # own keys and carried the newline into the report.
        clean_tri = [word for word in line.split() if cleaning.match(word)]
        # count every trigram of the cleaned line
        for trigram in nltk.trigrams(clean_tri):
            trigrams[trigram] = trigrams.get(trigram, 0) + 1

# Rank every trigram by its count and keep the first ten entries.
trigrams_sorted = sort_dict(trigrams)
first10pairs = trigrams_sorted[:10]

# One report line per trigram; leaving the `with` block closes the output file.
with destination as text:
    text.writelines(
        "{} : {} \n".format(tri, frequency) for tri, frequency in first10pairs
    )