data-science-from-scratch
data-science-from-scratch copied to clipboard
Chapter 13: Naive Bayes
I can't get the counts value.

`#TESTANDO O MODELO def get_subject_data(path):
path = r"C:\Users\mauro\Google Drive\Análise de Dados\Python\Livro_DS_Zero\Spam\20021010_spam"
data = []
# regex for stripping out the leading "Subject:" and any spaces after it
subject_regex = re.compile(r"^Subject:\s+")
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
is_spam = "ham" not in fn
with open(fn,'r') as file:
for line in file:
if line.startswith("Subject:"):
subject = subject_regex.sub("", line).strip()
data.append((subject, is_spam))
return data
def p_spam_given_word(word_prob): word, prob_if_spam, prob_if_not_spam = word_prob return prob_if_spam / (prob_if_spam + prob_if_not_spam)
def train_and_test_model(path):
data = get_subject_data(path)
random.seed(0) # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject))
for subject, is_spam in test_data]
counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted)
for _, is_spam, spam_probability in classified)
print(counts)
`