#!/usr/bin/env python
# -*- coding: utf-8 -*-


from __future__ import division
import nltk
from pattern.en import tag
import nltk.data
from random import shuffle, choice


# VARIABLES


# texts
# Transcript to analyse, the report that receives its POS tags, and the
# title line written at the top of that report.
SOURCE_PATH = "obama.txt"
DESTINATION_PATH = "pos_obama.txt"
REPORT_HEADER = "OBAMAS SYNTAX using PENN'S TREEBANK\n\n"

# Both handles are left open on purpose: the `with source` / `with destination`
# statements further down the script take ownership and close them.
source = open(SOURCE_PATH, "r")
destination = open(DESTINATION_PATH, "wt")
destination.write(REPORT_HEADER)


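# Alternative input: to analyse the Trump transcript instead, comment out the
# Obama source/destination/header lines above and uncomment these three.
# source = open("trump.txt", "r")
# destination = open("pos_trump.txt", "wt")
# destination.write("TRUMPS SYNTAX using PENN'S TREEBANK\n\n")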



# FUNCTIONS

## SCRIPT

# select 1 or more sentences from source
## split source text into list of sentences
# Load NLTK's pre-trained English Punkt model, used to find sentence boundaries.
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')

# Every sentence found in the source text, in reading order.
sentences_list = []
with source as text0:
    for line in text0:
        # Iterating a file yields one physical line at a time, not the whole
        # text, so the sentences of each line have to be accumulated.
        # Rebinding sentences_list inside the loop would keep only the
        # sentences of the last line (and nothing at all if that line is blank).
        stripped_line = line.strip()
        if not stripped_line:
            # Blank lines hold no sentences; skip them so no empty string
            # ends up in the list.
            continue
        sentences_list.extend(finding_sentences.tokenize(stripped_line))

# Write the POS tag of every token to the report, each tag followed by a
# single space; the words themselves are discarded.
with destination as tagged_out:
    for sentence in sentences_list:
        # tag() produces (word, POS-tag) pairs for the sentence.
        word_tag_pairs = tag(sentence, tokenize=True, encoding='utf-8')
        # Index 1 of each pair is the POS tag.
        pos_tags = [pair[1] for pair in word_tag_pairs]
        tagged_out.write("".join(pos + " " for pos in pos_tags))