# pos.py

# Welcome to Etherpad!

# This pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!

# Get involved with Etherpad at http://etherpad.org
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import nltk
from pattern.en import tag
import nltk.data
from random import shuffle, choice

# VARIABLES

# texts
# File names for the speech that is read and the report that is written.
source_name = "obama.txt"
destination_name = "pos_obama.txt"

# Open the input first, so a missing source file fails before any output
# file is created.
source = open(source_name, "r")
destination = open(destination_name, "wt")

# The report starts with a title line followed by one blank line.
report_title = "OBAMAS SYNTAX using PENN'S TREEBANK\n\n"
destination.write(report_title)

#source = open("trump.txt", "r")
#destination = open("pos_trump.txt", "wt")
# destination.write("TRUMPS SYNTAX using PENN'S TREEBANK\n\n")

# FUNCTIONS

## SCRIPT

# select 1 or more sentences from source
## split source text into list of sentences
# Load NLTK's English Punkt sentence tokenizer.
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')

# All sentences of the source text, in reading order.
sentences_list = []
with source as text0:
    for line in text0:
        stripped_line = line.strip()
        # blank lines carry no sentences; skipping them also avoids adding
        # empty strings to the list
        if not stripped_line:
            continue
        # tokenize() already returns a list of sentences; accumulate them
        # instead of rebinding the list, so the sentences of earlier lines
        # are not discarded when the source spans several lines
        sentences_list.extend(finding_sentences.tokenize(stripped_line))

# Write the POS-tag sequence of every sentence to the destination file.
with destination as text1:
    for sentence in sentences_list:
        # tag() yields one entry per token; index 1 of each entry is the
        # POS tag (index 0 is presumably the word itself)
        tagged_tokens = tag(sentence, tokenize=True, encoding='utf-8')
        # every tag is followed by a single space, and nothing separates
        # one sentence from the next
        tags_with_spaces = [token[1] + " " for token in tagged_tokens]
        text1.write("".join(tags_with_spaces))