#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract the part-of-speech (Penn Treebank tag) sequence of a speech.

Reads the source text line by line, splits each line into sentences with
NLTK's pre-trained punkt tokenizer, POS-tags every sentence with
pattern.en, and writes ONLY the tag sequence (space-separated) to the
destination file — i.e. the speaker's "syntax skeleton" without the words.
"""
from __future__ import division
import nltk
from pattern.en import tag
import nltk.data
from random import shuffle, choice

# VARIABLES

# texts — swap the comment blocks to process the other corpus
source = open("obama.txt", "r")
destination = open("pos_obama.txt", "wt")
destination.write("OBAMAS SYNTAX using PENN'S TREEBANK\n\n")
#source = open("trump.txt", "r")
#destination = open("pos_trump.txt", "wt")
# destination.write("TRUMPS SYNTAX using PENN'S TREEBANK\n\n")

# FUNCTIONS

## SCRIPT

# select 1 or more sentences from source

## split source text into list of sentences
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')

sentences_list = []
with source as text0:
    for line in text0:
        # tokenize() already returns a list of sentence strings.
        # BUG FIX: previously this ASSIGNED (sentences_list = ...) on every
        # line, so only the sentences of the file's last line survived the
        # loop; extend() accumulates sentences from ALL lines.
        sentences_list.extend(finding_sentences.tokenize(line.strip()))

with destination as text1:
    for sentence in sentences_list:
        # tag() returns (word, POS-tag) pairs; keep only the tag of each pair.
        for word, pos in tag(sentence, tokenize=True, encoding='utf-8'):
            text1.write(pos + " ")