### Python 2.7

from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from pattern.en import tag

# choose text source & open it
source = open("frankenstein_fragment.txt", 'r')

# tokenize source and get Part-of-Speech tags for each word
definitions = []
for line in source:
    # create tuple of tuples with pairs of word + POS-tag
    collection = tag(line, tokenize=True, encoding='utf-8')
    # transform tuple into list to be able to manipulate it
    collection = list(collection)
    # for each pair:
    for element in collection:
        # look for nouns & replace them with their first WordNet definition
        if element[1] == "NN":
            synsets = wn.synsets(element[0])
            if synsets:
                definitions.append("<")
                definitions.append(synsets[0].definition())
                definitions.append(">")
            else:
                # keep nouns that WordNet does not know as words
                definitions.append(element[0])
        else:
            # non-nouns are left as words
            definitions.append(element[0])

# write the transformed sentence
print(" ".join(definitions))

# close the text file
source.close()

# ----------------------------------------------------------------
## alternative using nltk

## to tokenize input text into sentences
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# print '\n-----\n'.join(tokenizer.tokenize(data))  # splits text into sentences

## to tokenize the sentences into words
# tokens = nltk.wordpunct_tokenize(data)
# text = nltk.Text(tokens)
# words = [w.lower() for w in text]
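# ----------------------------------------------------------------
## a minimal, runnable sketch of the nltk alternative above: it assumes
## the same frankenstein_fragment.txt file as input and that the punkt
## model has been fetched once with nltk.download('punkt')

data = open("frankenstein_fragment.txt", 'r').read()

# split the raw text into sentences with the pre-trained punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = tokenizer.tokenize(data)
print '\n-----\n'.join(sentences)

# split each sentence into lowercased word tokens
for sentence in sentences:
    words = [w.lower() for w in nltk.wordpunct_tokenize(sentence)]
    print words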