import csv

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Load the SMS Spam Collection: one message per line, "<label>\t<text>".
# The file is raw tab-separated text, not CSV-quoted, so quoting is disabled;
# with the default quoting a stray '"' inside a message can make the parser
# merge several lines into a single row and silently drop samples.
data = pd.read_csv(
    'Inne/SMSSpamCollection',
    header=None,
    names=["etykieta", "wiadomosc"],
    sep="\t",
    quoting=csv.QUOTE_NONE,
)
# Rename the 'ham' class so the labels read 'nie-spam' / 'spam'.
data['etykieta'] = data['etykieta'].replace('ham', 'nie-spam')
data.info()

# Features are the raw message texts, targets are the class labels.
X = data['wiadomosc']
y = data['etykieta']
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Learn the vocabulary and IDF weights on the training messages only, then
# reuse that fitted vectorizer for the test messages so both matrices share
# the same columns and nothing from the test set leaks into the features.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Both shapes are (number of messages, vocabulary size).
print("TF-IDF Zestaw szkoleniowy:", X_train_tfidf.shape)
print("TF-IDF Zestaw testowy :", X_test_tfidf.shape)

# Train a linear SVM on the TF-IDF features. fit() returns the estimator
# itself, so construction and training are chained. dual='auto' leaves the
# choice between the primal and dual formulation to scikit-learn.
classifier = LinearSVC(dual='auto').fit(X_train_tfidf, y_train)

# Predict a label for every held-out message.
y_pred = classifier.predict(X_test_tfidf)

# Report the quality of the predictions on the held-out set.
# The class order is passed explicitly to confusion_matrix so that the matrix
# rows/columns are guaranteed to line up with the DataFrame headers (instead
# of relying on scikit-learn's implicit sorted-label order), and so the
# matrix stays 2x2 even if one class is missing from the test split.
labels = ['nie-spam', 'spam']

print("Macierz konfuzji:")
# Rows are the true labels, columns are the predicted labels.
df = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=labels),
    index=labels,
    columns=labels,
)
print(df)
print("Accuracy score:", metrics.accuracy_score(y_test, y_pred))
print("Raport klasyfikacji dla SVM:")
print(classification_report(y_test, y_pred))