#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import nltk
from pattern.en import tag
import nltk.data
from random import shuffle, choice
# --- Input / output files -------------------------------------------------
# Corpus to analyse (plain text) and the file receiving the POS-tag stream.
source = open("obama.txt", "r")
destination = open("pos_obama.txt", "wt")
# Header line identifying the corpus and the tagset used.
destination.write("OBAMAS SYNTAX using PENN'S TREEBANK\n\n")
# Alternate corpus — uncomment these three lines (and comment the three
# above) to process the Trump speeches instead:
# source = open("trump.txt", "r")
# destination = open("pos_trump.txt", "wt")
# destination.write("TRUMPS SYNTAX using PENN'S TREEBANK\n\n")
# --- Sentence segmentation ------------------------------------------------
# Split the source text into a flat list of sentences using NLTK's
# pre-trained Punkt sentence tokenizer for English.
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
sentences_list = []
with source as text0:
    for line in text0:
        # BUG FIX: the original re-ASSIGNED sentences_list on every line,
        # so only the sentences of the file's LAST line survived for the
        # tagging pass below. extend() accumulates sentences from all lines.
        # (This also drops the pointless '\n'.join(...).split('\n') round
        # trip: tokenize() already returns a list of sentence strings, and
        # for a blank line it returns [] so nothing spurious is added.)
        sentences_list.extend(finding_sentences.tokenize(line.strip()))
# --- POS tagging ----------------------------------------------------------
# For every sentence, tag each token with its Penn Treebank part-of-speech
# label (via pattern.en) and write the tags — words discarded — to the
# destination file, each tag followed by a single space.
with destination as text1:
    for sentence in sentences_list:
        # tag() pairs every token with its POS label; materialise as a list
        # so the pairs can be iterated/unpacked below.
        tagged_pairs = list(tag(sentence, tokenize=True, encoding='utf-8'))
        for _word, pos_label in tagged_pairs:
            text1.write(pos_label + " ")