### Python 2.7

from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from pattern.en import tag

# choose text source & open it
source = open("frankenstein_fragment.txt", 'r')

# tokenize source and get Part-of-Speech tags for each word
definitions = []

for line in source:
        # create tuple of tuples with pairs of word + POS-tag
        collection = tag(line, tokenize=True, encoding='utf-8')
        # transform tuple into list to be able to manipulate it
        collection = list(collection)
        # for each pair:
        for element in collection:
                # look for nouns & replace them with their first WordNet definition
                if element[1] == "NN":
                        synsets = wn.synsets(element[0])
                        # some nouns have no WordNet entry: keep the word itself
                        if synsets:
                                definitions.append("<")
                                definitions.append(synsets[0].definition())
                                definitions.append(">")
                        else:
                                definitions.append(element[0])
                else:
                        # non-nouns are left as words
                        definitions.append(element[0])

# write the transformed sentence
print(" ".join(definitions))
                                
# close the text file
source.close()
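
For reference, pattern.en's tag() returns (word, Penn Treebank tag) pairs, which is why the loop above only needs to test for the "NN" tag. A quick sanity check on a single sentence could look like the lines below; the test sentence and the tags shown in the comment are only illustrative and depend on the installed pattern version.

# inspect the (word, POS-tag) pairs that tag() produces for one test sentence
# (tag is already imported above; repeated here so the snippet runs on its own)
from pattern.en import tag

pairs = tag("The monster opened the door.", tokenize=True)
for word, pos in pairs:
        print(word + " / " + pos)        # e.g. "monster / NN", "opened / VBD"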



# ----------------------------------------------------------------
## alternative using nltk
## load the punkt sentence tokenizer
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

## split the input text ("data") into sentences
#print '\n-----\n'.join(tokenizer.tokenize(data))

## tokenize the text into words and lowercase them
#tokens = nltk.wordpunct_tokenize(data)
#text = nltk.Text(tokens)
#words = [w.lower() for w in text]
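
A minimal runnable sketch of that nltk-only route, assuming the punkt, averaged_perceptron_tagger and wordnet data have been fetched with nltk.download(); it swaps pattern.en's tagger for nltk.pos_tag but otherwise mirrors the noun-replacement script above.

# nltk-only variant: replace every noun (NN) with its first WordNet definition
import nltk
from nltk.corpus import wordnet as wn

source = open("frankenstein_fragment.txt", 'r')
data = source.read()
source.close()

definitions = []
# wordpunct_tokenize splits the text into words, pos_tag labels them
for word, pos in nltk.pos_tag(nltk.wordpunct_tokenize(data)):
        if pos == "NN":
                synsets = wn.synsets(word)
                # keep words that have no WordNet entry
                if synsets:
                        definitions.append("<")
                        definitions.append(synsets[0].definition())
                        definitions.append(">")
                else:
                        definitions.append(word)
        else:
                definitions.append(word)

print(" ".join(definitions))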