So, I am working on a small project that detects fake news. This is the model:
from wordcloud import STOPWORDS
from sklearn.metrics import confusion_matrix
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd #data processing
import pickle
import re
import nltk
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train = train.fillna(" ")
test = test.fillna(" ")
test["total"] = test["title"]+' '+test["author"]+test["text"]
train["total"] = train["title"]+' '+train['author']+train["text"]
real_words = ''
fake_words = ''
stopwords = set(STOPWORDS)
for val in train[train['label']==1].total:
tokens = val.split()
for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
real_words += " ".join(tokens)+" "
for val in train[train['label']==0].total:
tokens = val.split()
for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
fake_words += " ".join(tokens)+" "
lemmatizer = WordNetLemmatizer()
for index,row in train.iterrows():
filter_sentence = ''
sentence = row['total']
sentence = re.sub(r'[^ws]','', sentence)
words = nltk.word_tokenize(sentence)
words = [w for w in words if w not in stopwords]
for word in words:
filter_sentence += ' ' + str(lemmatizer.lemmatize(word)).lower()
train.loc[index, 'total'] = filter_sentence
train = train[['total', 'label']]
X_train = train['total']
Y_train = train['label']
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)
test_counts = count_vectorizer.transform(test['total'].values)
test_tfidf = tfidf.transform(test_counts)
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print('Accuracy on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
cm = confusion_matrix(y_test, pred)
print(cm)
from sklearn.svm import SVC
sv = SVC(kernel='linear').fit(X_train, y_train)
pickle.dump(sv, open('fake.pkl', 'wb'))
This is the Flask file:
from flask import Flask, render_template, request
import numpy as np
import pickle
app = Flask(__name__)
model = pickle.load(open('fake.pkl', 'rb'))
@app.route('/')
def man():
return render_template('home.html')
@app.route('/predict', methods=['POST'])
def home():
data1 = request.form['title']
data2 = request.form['author']
data3 = request.form['news']
arr = np.array([[data1, data2, data3]])
pred = model.predict(arr)
return render_template('result.html', data=pred)
if __name__ == "__main__":
app.run(debug=True)
Whenever I give the input and run it, it shows:
ValueError: could not convert string to float: 'Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times
What can I do to fix this? Can someone please explain? Thanks :)
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…