Naive Bayes Realized By Python

2021-07-21

字数统计: 678字 | 阅读时长≈ 4分

For background on this problem, see the lectures of the machine learning course at USTC.
This exercise is taken from HW4.

Programming Exercise: Naive Bayes

We provide you with a data set that contains spam and non-spam emails (“hw4 nb.zip”). Please use the Naive Bayes Classifier to detect the spam emails. Finish the following exercises by programming. You can use your favorite programming language.

Remove all the tokens that contain non-alphabetic characters.
Train the Naive Bayes Classifier on the training set according to Algorithm 1.
Test the Naive Bayes Classifier on the test set according to Algorithm 2.
Compute the confusion matrix, precision, recall, and F1 score. Please report your result.

QQ截图20210721183700

Remove non-alphabetic tokens

In this section, I simply split each email on the delimiters " " and "\n" to obtain all the tokens, and then filter out the empty strings (they are not meaningful for our classification). Finally, every token that contains a non-alphabetic character is skipped.

Training and Testing

The complete code is as follows:

import numpy as np
import os
import string
import re
import decimal

# Names of the raw e-mail folders (test set, training set). The training
# folder is spelled with a hyphen, matching the "train-mails" paths used by
# the rest of the script.
FD = ["test-mails", "train-mails"]
# Alphabet that defines an "alphabetic" token: ASCII letters only.
charset = string.ascii_letters


def remove_nonalphabelta_character(folder):
    """Write a cleaned copy of every file in *folder*.

    Each file is split on spaces and newlines and empty tokens are discarded.
    The first remaining token is dropped, and every other token that consists
    solely of ASCII letters is written, followed by a single space, to a file
    of the same name inside ``folder + "_RM"`` (created when missing).

    Args:
        folder: Path of the directory that holds the raw e-mail files.
    """
    files = os.listdir(folder)
    rm_path = folder + "_RM"
    os.makedirs(rm_path, exist_ok=True)
    letters = frozenset(string.ascii_letters)
    for name in files:
        # os.path.join keeps the paths valid on every platform; a hard-coded
        # "\\" separator only works on Windows.
        with open(os.path.join(folder, name), "r") as f_in:
            tokens = [tok for tok in re.split("\n| ", f_in.read()) if tok]
        # The first token is always skipped (presumably the "Subject:" header
        # of each mail -- TODO confirm against the data set).
        kept = [tok for tok in tokens[1:] if letters.issuperset(tok)]
        # The context manager guarantees the output is flushed and closed.
        with open(os.path.join(rm_path, name), "w") as f_out:
            f_out.write("".join(tok + " " for tok in kept))


def train_naive_bayes(folder):
    """Estimate the Naive Bayes parameters from the e-mails in *folder*.

    A file whose name contains ``"spmsg"`` is treated as spam (label 0); any
    other file is treated as a normal message (label 1). Files are split on
    spaces and newlines, and every occurrence of a token is counted.

    Args:
        folder: Directory containing the (already cleaned) training e-mails.

    Returns:
        A tuple ``(Pc0, Pc1, dic, p0k, p1k)`` where ``Pc0`` / ``Pc1`` are the
        fractions of spam / normal files, ``dic`` is the vocabulary in order
        of first appearance, and ``p0k[i]`` / ``p1k[i]`` are the add-one
        smoothed probabilities of ``dic[i]`` in spam / normal mail.

    Raises:
        ZeroDivisionError: If *folder* contains no files.
    """
    files = os.listdir(folder)
    position = {}   # word -> its index in dic; gives O(1) lookups
    dic = []
    dic_num = []    # occurrences of each word over all mails
    spm_num = []    # occurrences of each word in spam mails only
    sp = 0
    msg = 0
    for name in files:
        is_spam = "spmsg" in name
        if is_spam:
            sp += 1
        else:
            msg += 1
        with open(os.path.join(folder, name), "r") as fh:
            words = [w for w in re.split("\n| ", fh.read()) if w]

        for word in words:
            pos = position.get(word)
            if pos is None:
                pos = position[word] = len(dic)
                dic.append(word)
                dic_num.append(0)
                spm_num.append(0)
            dic_num[pos] += 1
            if is_spam:
                spm_num[pos] += 1

    # 0 label = spam
    # 1 label = msg
    D = len(files)
    Pc0 = sp / D
    Pc1 = msg / D
    V_len = len(dic)
    V0 = sum(spm_num)            # total tokens seen in spam
    V1 = sum(dic_num) - V0       # total tokens seen in normal mail
    p0k = [(spam + 1) / (V0 + V_len) for spam in spm_num]
    p1k = [(total - spam + 1) / (V1 + V_len)
           for total, spam in zip(dic_num, spm_num)]
    return Pc0, Pc1, dic, p0k, p1k


def test_naive_bayes(email, Pc0, Pc1, dic, p0k, p1k):
    """Classify one e-mail file; return True when it is predicted to be spam.

    The file is split on spaces and newlines. Every distinct token that
    appears in ``dic`` contributes its class-conditional probability once;
    tokens not in ``dic`` are ignored.

    Args:
        email: Path of the e-mail file to classify.
        Pc0: Prior probability of spam (label 0).
        Pc1: Prior probability of a normal message (label 1).
        dic: Vocabulary list; ``p0k`` and ``p1k`` are indexed like it.
        p0k: Per-word probabilities given spam.
        p1k: Per-word probabilities given a normal message.

    Returns:
        bool: True if the spam score is strictly greater than the normal score.
    """
    with open(email, "r") as fh:
        tokens = [tok for tok in re.split("\n| ", fh.read()) if tok]

    # word -> first index in dic, so each lookup is O(1) instead of a scan.
    position = {}
    for idx, word in enumerate(dic):
        position.setdefault(word, idx)

    # Distinct known words, kept in order of first appearance.
    known = [tok for tok in dict.fromkeys(tokens) if tok in position]

    # A product of many small probabilities underflows a float, so multiply
    # as high-precision Decimals. The precision is raised in a local context
    # only, leaving the caller's decimal settings untouched.
    with decimal.localcontext() as ctx:
        ctx.prec = 2048
        predict_0 = decimal.Decimal(Pc0)
        predict_1 = decimal.Decimal(Pc1)
        for word in known:
            n = position[word]
            predict_0 = predict_0 * decimal.Decimal(p0k[n])
            predict_1 = predict_1 * decimal.Decimal(p1k[n])
        return predict_0 > predict_1


# The name starts with "test_", so tell pytest not to collect this helper as
# a test case (it requires arguments pytest cannot supply).
test_naive_bayes.__test__ = False


# remove_nonalphabelta_character("train-mails")
# remove_nonalphabelta_character("test-mails")

# Train on the cleaned training set, then classify every cleaned test e-mail
# and compare the prediction with the label encoded in the file name
# ("spmsg" in the name means spam).
Pc0, Pc1, dic, p0k, p1k = train_naive_bayes("train-mails_RM")
test_dir = "test-mails_RM"
files = os.listdir(test_dir)
N = len(files)

false_negative = false_positive = 0
sp = msg = 0
for f in files:
    path = os.path.join(test_dir, f)
    t = test_naive_bayes(path, Pc0, Pc1, dic, p0k, p1k)
    if "spmsg" in f:
        sp += 1
        if not t:
            # Spam classified as a normal message: prints (expected, got).
            print(True, t)
            false_negative += 1
    else:
        msg += 1
        if t:
            # Normal message classified as spam: prints (expected, got).
            print(False, t)
            false_positive += 1
print(sp)

true_positive = sp - false_negative
predicted_spam = true_positive + false_positive
# Each metric falls back to 0.0 when its denominator is zero (no spam in the
# test set, nothing predicted as spam, or zero precision/recall) instead of
# raising ZeroDivisionError.
precision = true_positive / predicted_spam if predicted_spam else 0.0
recall = true_positive / sp if sp else 0.0
F_score = 2 / (1 / precision + 1 / recall) if precision and recall else 0.0
print(precision, recall, F_score)
# output
# False True
# True False
# 49
# 0.9795918367346939 0.9795918367346939 0.979591836734694

Results

As we can see, the results on the test set are rather good. Only one spam email is classified as a normal email, and only one normal email is incorrectly classified as spam.

Thus the total result is as follows:

output
False True
True False
49
precision		   recall 			  F_score
0.9795918367346939 0.9795918367346939 0.979591836734694