import nltk
nltk.download('all')  # downloads every NLTK resource; only a handful ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'state_union') are actually used below
# removing stop words from corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
# print(stop_words) # prints the stop words defined by NLTK for the english language
word_tokens = word_tokenize(example_sent)
# list-comprehension version
filtered_sentence = [w for w in word_tokens if w not in stop_words]
# equivalent loop version
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)
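Note that this membership test is case-sensitive: "This" slips through because only the lowercase "this" is in NLTK's stop word set. A minimal case-insensitive variant (the lowercasing step is our own addition, not part of the tutorial code above):
# case-insensitive filtering: compare the lowercased token against the stop word set
filtered_ci = [w for w in word_tokens if w.lower() not in stop_words]
print(filtered_ci)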
Need for stemming
The idea behind stemming is a kind of normalization: many variations of a word carry the same meaning and differ only in tense or affix.
The reason we stem is to shorten the lookup and to normalize sentences.
Consider:
I was taking a ride in the car.
I was riding in the car.
These sentences mean the same thing: "in the car" is the same, "I was" is the same, and the -ing ending clearly marks past tense in both cases. So is it truly necessary to differentiate between "ride" and "riding" when all we are trying to figure out is what this past-tense activity was?
No, not really.
This is just one minor example, but imagine every word in the English language, every possible tense and affix you can put on a word. Having individual dictionary entries per version would be highly redundant and inefficient, especially since, once we convert to numbers, the "value" is going to be identical.
One of the most popular stemming algorithms is the Porter stemmer, first published in 1980.
# using PorterStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
print(example_words)
# stemming example_words
for w in example_words:
    print(ps.stem(w))  # python, python, python, python, pythonli
# stemming sentences
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for word in words:
    print(word, '\t\t', ps.stem(word))
One of the more powerful aspects of the NLTK module is the part-of-speech tagging it can do for you. This means labeling words in a sentence as nouns, adjectives, verbs, etc. Even more impressively, it also labels by tense, and more. Here are a couple of entries from the tag list, what they mean, and some examples:
POS tag list (excerpt):
WRB - wh-adverb: where, when
PRP$ - possessive pronoun: my, his, hers
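Rather than reproducing the whole Penn Treebank tag set here, you can ask NLTK to print the documentation for any tag itself (the 'tagsets' resource this relies on is included in the 'all' download):
# look up a tag's meaning straight from NLTK's tag set documentation
nltk.help.upenn_tagset('WRB')   # wh-adverb
nltk.help.upenn_tagset('PRP$')  # possessive pronoun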
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer learns sentence boundaries through unsupervised machine learning, so you can train it on any body of text you like.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')
# training the PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
process_content()
# another example
sent = 'My name is Amit. I am currently studying Computer Science in IIT Kharagpur.'
words = word_tokenize(sent)
tagged_words = nltk.pos_tag(words)
print(tagged_words)
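The output is a list of (word, tag) pairs, roughly like the following (exact tags can vary slightly between NLTK versions):
# [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Amit', 'NNP'), ('.', '.'),
#  ('I', 'PRP'), ('am', 'VBP'), ('currently', 'RB'), ('studying', 'VBG'),
#  ('Computer', 'NNP'), ('Science', 'NNP'), ('in', 'IN'), ('IIT', 'NNP'),
#  ('Kharagpur', 'NNP'), ('.', '.')]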
Chunking uses POS tagging and regular expressions to group tagged words into meaningful phrases.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer learns sentence boundaries through unsupervised machine learning, so you can train it on any body of text you like.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')
# training the PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # chunk: any number of adverbs (RB*) and verbs (VB*), then one or more proper nouns (NNP), then an optional singular noun (NN)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()  # opens a GUI window per sentence; comment out for non-interactive runs
            print(chunked)
    except Exception as e:
        print(str(e))
process_content()
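If you want to pull the chunks out programmatically instead of (or as well as) drawing them, chunked is an nltk.Tree, so you can filter its subtrees by label. This sketch would sit inside the loop above, where chunked is defined:
# print only the subtrees that the grammar labeled 'Chunk'
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
    print(subtree)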
A very similar operation to stemming is called lemmatizing. The major difference is that, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.
So your root stem, meaning the word you end up with, is not necessarily something you can look up in a dictionary, but you can always look up a lemma.
Sometimes you will wind up with a very similar word, and sometimes you will wind up with a completely different word.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))
Here we've got a bunch of examples of the lemma for the words we used. The only major thing to note is that lemmatize takes a part-of-speech parameter, pos. If not supplied, the default is 'n' (noun), which means the lemmatizer will look for the closest noun; that can create trouble, as with "run" above. Keep this in mind when you use lemmatizing!
From the Stanford IR book (https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html):
"Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma. If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma."
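To make the contrast concrete, here is a small side-by-side sketch (the word choices are our own illustration; the outputs noted in the comments are what Porter and WordNet typically produce, and may vary by NLTK version):
from nltk.stem import PorterStemmer, WordNetLemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# stemming chops endings heuristically; lemmatization maps to dictionary forms
print(ps.stem("geese"), lemmatizer.lemmatize("geese"))             # typically: gees goose
print(ps.stem("running"), lemmatizer.lemmatize("running", 'v'))    # typically: run run
print(ps.stem("better"), lemmatizer.lemmatize("better", pos='a'))  # typically: better good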
from nltk.corpus import wordnet
# synsets for 'program'
syns = wordnet.synsets('program')
print(syns[0].name())              # the first synset's name, e.g. 'plan.n.01'
print(syns[0].lemmas()[0].name())  # just the lemma word, e.g. 'plan'
print(syns[0].definition())        # definition
print(syns[0].examples())          # usage examples
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))
# semantic similarity via Wu-Palmer (wup) similarity, a score between 0 and 1
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))  # high (~0.9): ship and boat sit close together in the WordNet hierarchy
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))  # both vehicles, so still fairly high (~0.7)
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))  # much lower: little in common
w1 = wordnet.synset('love.n.01')
w2 = wordnet.synset('fat.n.01')
print(w1.wup_similarity(w2))  # also low: unrelated concepts
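Wu-Palmer is not the only measure WordNet exposes; path_similarity, for example, scores two synsets by the shortest path between them in the hypernym/hyponym graph:
# alternative measure: shortest-path similarity between two synsets
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.path_similarity(w2))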