import matplotlib as plt
# The plotting functions used below (plot, xlabel, ylabel, text, show) live in
# matplotlib.pyplot, not in the top-level package; this later binding is the
# one that `plt` refers to for the rest of the script.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
# Preview of the first rows of df1.  The value is only displayed in an
# interactive session / notebook; run as a plain script this has no effect.
df1.head()

# Histogram (40 bins) of the 'len' column for the rows flagged in 'in_2dom',
# then for the rows flagged in 'in_hmm'.  The two calls run back to back with
# no show() in between.
for flag_column in ('in_2dom', 'in_hmm'):
    # The explicit comparison with True is kept on purpose: the dtype of the
    # flag columns is not visible here, so the mask is built exactly as before.
    flagged_rows = df1[flag_column] == True  # noqa: E712
    df1.loc[flagged_rows, 'len'].hist(bins=40)
# Load the HMM search results from a whitespace-delimited text table.
# The separator is a regular expression, so it is written as a raw string:
# in a plain literal '\s' is an invalid escape sequence, which Python warns
# about (DeprecationWarning, SyntaxWarning since 3.12).
hmm = pd.read_csv('./pr9/hmm_res_df.txt', sep=r'\s+')

# Move whatever read_csv used as the index into ordinary columns.
hmm = hmm.reset_index()
hmm.head()  # preview; only displayed in an interactive session / notebook

# Scores plotted against row position.  `plt` must be matplotlib.pyplot here.
plt.plot(hmm['score'])
plt.show()

# Label every row: True when its 'level_0' value is contained in ac_l.
# NOTE(review): 'level_0' is presumably the first index level produced by
# reset_index() (e.g. the sequence identifier) -- confirm against the file.
hmm['in_2dom'] = hmm.level_0.isin(ac_l)
# ROC curve of the HMM score against the 'in_2dom' labels.
score = hmm["score"]
fpr, tpr, thres = roc_curve(hmm["in_2dom"], score)

# Turn the rates back into counts at every threshold: true positives are a
# fraction of all positives, true negatives a fraction of all negatives.
n_pos = hmm["in_2dom"].sum()
n_neg = (~hmm["in_2dom"]).sum()
tp = tpr * n_pos
tn = (1 - fpr) * n_neg

# Accuracy per threshold and the index at which it peaks.
acc = (tp + tn) / len(hmm["in_2dom"])
am = np.argmax(acc)

# Draw the curve and mark the accuracy-optimal operating point on it.
best_fpr, best_tpr = fpr[am], tpr[am]
plt.plot(fpr, tpr)
plt.plot([best_fpr], [best_tpr], marker="+", markersize=20, color="C1")
plt.xlabel('fpr')
plt.ylabel('tpr')
# The label is offset slightly so it does not sit on top of the marker.
plt.text(best_fpr + 0.03, best_tpr - 0.07, f"threshold = {thres[am]}")
plt.show()
# Confusion-matrix counts at every ROC threshold, derived from the rates and
# the number of positive / negative labels.
pos_total = hmm["in_2dom"].sum()
neg_total = (~hmm["in_2dom"]).sum()
tp = tpr * pos_total
fn = (1 - tpr) * pos_total
fp = fpr * neg_total
tn = (1 - fpr) * neg_total

# F1 = 2*TP / (2*TP + FP + FN), evaluated per threshold.
f1 = (2 * tp) / (2 * tp + fp + fn)
am_f1 = np.argmax(f1)  # index of the maximum F1
thres[am_f1]  # threshold at that index; only displayed in an interactive session

# Draw the curve and mark the F1-optimal operating point on it.
f1_fpr, f1_tpr = fpr[am_f1], tpr[am_f1]
plt.plot(fpr, tpr)
plt.plot([f1_fpr], [f1_tpr], marker="+", markersize=20, color="C1")
plt.xlabel('fpr')
plt.ylabel('tpr')
# Two-line label (threshold and best F1), offset away from the marker.
plt.text(
    f1_fpr + 0.03,
    f1_tpr - 0.15,
    f"threshold = {thres[am_f1]}\nf1 = {f1.max():.3f}",
)
plt.show()