Naive Bayes classifiers are a popular statistical technique of e-mail filtering. They typically use bag of words features to identify spam e-mail, an approach commonly used in text classification.
Naive Bayes classifiers work by correlating the use of tokens (typically words, or sometimes other things), with spam and non-spam e-mails and then using Bayes' theorem to calculate a probability that an email is or is not spam.
Naive Bayes spam filtering is a baseline technique for dealing with spam that can tailor itself to the email needs of individual users and give low false positive spam detection rates that are generally acceptable to users. It is one of the oldest ways of doing spam filtering, with roots in the 1990s.
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from textblob import TextBlob
from wordcloud import WordCloud
# Load the SMS Spam Collection: tab-separated, no header row; first column is
# the label ('ham'/'spam'), second is the raw message text.
df = pd.read_csv('./data/SMSSpamCollection' , sep='\t' , names=['status', 'message'])
df.head()
# Total number of messages in the corpus.
len(df)
# Number of messages labelled as spam.
len(df[df.status=='spam'])
# Per-label summary statistics of the message column.
df.groupby('status').describe()
Getting the length of each message:
# Add a 'length' column holding the character count of each message.
# .str.len() is the idiomatic (and vectorized) form of map(lambda msg: len(msg)).
df['length'] = df['message'].str.len()
df.head()
# Distribution of message lengths over the whole corpus.
df['length'].plot(bins=40,kind='hist',color='orange')
#describing message length
print(df['length'].describe())
#describing HAM message by length
print('HAM',df['length'].loc[df['status']=='ham'].describe())
#describing SPAM message by length
print('SPAM',df['length'].loc[df['status']=='spam'].describe())
# density=True normalises the histogram; the old `normed` keyword was
# deprecated in matplotlib 2.1 and later removed.
df['length'].plot(kind='hist',bins=20,color='red',facecolor='green',alpha=0.5,density=True)
#length difference between SPAM and HAM
df.hist(column='length', by='status', bins=20)
print("Largest messge is:",list(df.message[df.length>900])) #we can get this 900 from above df.describe
print("Smallest message is ",list(df.message[df.length<3])) #we can get this 3 from above df.describe
Dependency: first you need to download the NLTK 'punkt' and 'wordnet' corpora, otherwise tokenization/lemmatization will raise an error:
>>import nltk
>>nltk.download('punkt')
>>nltk.download('wordnet')
class Split:
    """Tokenize messages into individual words and reduce them to lemmas.

    Both methods delegate the actual text segmentation to TextBlob.
    """

    def into_tokens(self, msg):
        """Return the list of word tokens found in *msg*."""
        return TextBlob(msg).words

    def into_lemmas(self, message):
        """Return the base form (lemma) of every word in *message*."""
        tokens = TextBlob(message).words
        return [token.lemma for token in tokens]
# Shared tokenizer instance; into_lemmas is also handed to CountVectorizer below.
split = Split()
# Preview tokenisation vs. lemmatisation on the first five messages.
df.message.head().apply(split.into_tokens)
df.message.head().apply(split.into_lemmas)
# print(split_into_lemmas("go goes"))
CountVectorizer converts a collection of text documents to a matrix of token counts.
Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.
# Use the lemma-based analyzer so inflected forms share one feature column.
vectorizer = CountVectorizer(analyzer=split.into_lemmas)
# example on a tiny toy corpus
X = vectorizer.fit_transform(['hellow','hellow we are back','we we we are back'])
print(X)
# .vocabulary_ maps each token to its column index in the matrix (not counts)
print(vectorizer.vocabulary_)
# Re-fit on the full SMS corpus (replaces the toy vocabulary above).
X = vectorizer.fit_transform(df['message'])
Getting feature names using `vectorizer.get_feature_names()`:
# First 10 learned feature names.
# NOTE(review): renamed to get_feature_names_out() in scikit-learn >= 1.0 —
# update this call if running on a modern environment.
vectorizer.get_feature_names()[:10] #printing 10 feature
# NOTE(review): this refits the already-fitted vectorizer on the same corpus;
# vectorizer.transform(df['message']) would give the identical matrix cheaper.
bag_of_words = vectorizer.fit_transform(df['message'])
# Transform an unseen message using the fitted vocabulary (no refit).
X = vectorizer.transform(['win win win this'])
print(bag_of_words.shape,X.shape)
Now *bag_of_words* contains the full SMS corpus in the form of count vectors.
TfidfTransformer
tf-idf (TFIDF), short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf-idf value increases proportionally to the number of times a word appears in the document, but is often offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general. Nowadays, tf-idf is one of the most popular term-weighting schemes.
# Re-weight raw counts by tf-idf so corpus-wide common words count for less.
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bag_of_words)
print(tfidf.shape)
# Encode labels numerically: ham -> 1, spam -> 0.
df.loc[df["status"]=='ham',"status"]=1
df.loc[df["status"]=='spam',"status"]=0
df['status'] = df['status'].astype('int')
MultinomialNB
The multinomial naive Bayes classifier from scikit-learn.
class Train:
    """Thin wrapper around a multinomial naive Bayes model for training and testing."""

    def __init__(self):
        # Multinomial NB suits discrete / count-like features such as tf-idf.
        self.clf = MultinomialNB()

    def train(self, x, y):
        """Fit the classifier on feature matrix x and labels y."""
        self.clf.fit(x, y)

    def score(self, x, y):
        """Return the mean accuracy of the fitted model on x against labels y."""
        return self.clf.score(x, y)

    def test(self, y):
        """Predict class labels for the feature matrix y."""
        return self.clf.predict(y)

    def accuracy(self, x, y):
        """Print and return the NUMBER of correctly classified samples.

        With normalize=False, accuracy_score returns a raw count rather
        than a fraction.
        """
        acc = accuracy_score(x, y, normalize=False)
        print('accuracy', acc)
        return acc

    def probability(self, x):
        """Return per-class probability estimates for x."""
        return self.clf.predict_proba(x)
# Hold out 10% of the tf-idf matrix for testing.
X_train, X_test, Y_train, Y_test = train_test_split(tfidf,df['status'],test_size=.1)
clf = Train()
clf.train(X_train,Y_train)
# Accuracy on the held-out 10%.
print(clf.score(X_test,Y_test))
# NOTE(review): everything below is computed over the FULL corpus, which
# includes the training data, so these metrics overstate generalization.
all_predictions = clf.test(tfidf)
a = clf.probability(tfidf)
print(a)
print(all_predictions)
print('accuracy', accuracy_score(df['status'], all_predictions))
print('confusion matrix\n', confusion_matrix(df['status'], all_predictions))
print('(row=expected, col=predicted)')
message = ['attending workshop']
# BUG FIX: the classifier was trained on tf-idf-weighted features, so a new
# message must go through the SAME pipeline — CountVectorizer followed by the
# fitted TfidfTransformer — not raw counts alone.
X = tfidf_transformer.transform(vectorizer.transform(message))
print(X.shape)
clf.test(X)
# spam = 0, ham = 1: build one concatenated text blob per class
spam_wl = ' '.join(list(df[df['status']==0]['message']))
ham_wl = ' '.join(list(df[df['status']==1]['message']))
# Generate a word cloud image for each class.
spam_wc = WordCloud(background_color='white',width=512,height=512).generate(spam_wl)
ham_wc = WordCloud(background_color='white',width=512,height=512).generate(ham_wl)
plt.figure(figsize=(10,8))
plt.imshow(spam_wc)
plt.figure(figsize=(10,8))
plt.imshow(ham_wc)
# Per-class precision/recall/F1 over the full corpus (see leakage note above
# train/test split cell: these are optimistic).
print(classification_report(df['status'], all_predictions))
# Visualise the confusion matrix; with the binary cmap, darker = higher count.
plt.matshow(confusion_matrix(df['status'], all_predictions), cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('expected label')
plt.xlabel('predicted label')