In [1]:
!pip install -q iterative-stratification
In [52]:
import pandas as pd
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# Load train.csv into `df`; the later cells one-hot encode it and merge fold
# ids back onto it.
df = pd.read_csv(filepath_or_buffer='/root/.cache/data/train.csv')
In [53]:
# One-hot encode `discourse_type` and keep just the first five rows as a
# quick preview of the resulting columns.
dfx = (
    pd.get_dummies(df, columns=["discourse_type"])
    .head(n=5)
)
dfx
Out[53]:
In [54]:
# One-hot encode `discourse_type`, then sum the rows of each `id` so every id
# is represented by a single row (`as_index=False` keeps `id` as a column).
dfx = (
    pd.get_dummies(df, columns=["discourse_type"])
    .groupby(["id"], as_index=False)
    .sum()
)
dfx
Out[54]:
In [55]:
# Show the column names of `dfx` to see which ones to keep as labels.
dfx.columns
Out[55]:
In [56]:
# Keep the `id` key plus the one-hot `discourse_type*` columns.
# `and` binds tighter than `or`, so the `discourse_type_num` exclusion must be
# grouped with the prefix test. Ungrouped, it only applied to the `c == "id"`
# branch (where it is always true) and never filtered anything out.
cols = [
    c
    for c in dfx.columns
    if c == "id" or (c.startswith("discourse_type") and c != "discourse_type_num")
]
cols
Out[56]:
In [57]:
# Narrow `dfx` to the selected columns. The explicit copy makes the result an
# independent frame, so adding a column to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
dfx = dfx[cols].copy()
dfx
Out[57]:
In [58]:
# 5-fold multilabel-stratified splitter with shuffling and a fixed random_state.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every column other than the `id` key is treated as a stratification label.
labels = [name for name in dfx.columns if name != "id"]
dfx_labels = dfx.loc[:, labels]
dfx_labels
Out[58]:
In [59]:
# Add a `kfold` column filled with the placeholder -1. `assign` returns a new
# frame instead of mutating a column subset in place, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
dfx = dfx.assign(kfold=-1)
In [86]:
# Self-contained setup: re-read the CSV and recompute the per-id sums of the
# one-hot `discourse_type` columns.
df = pd.read_csv('/root/.cache/data/train.csv')
dfx = pd.get_dummies(df, columns=["discourse_type"]).groupby(["id"], as_index=False).sum()

# Keep `id` plus the one-hot `discourse_type_*` columns.
# `and` binds tighter than `or`, so the `discourse_type_num` exclusion has to
# be grouped with the prefix test; ungrouped it only applied to the
# `c == "id"` branch and never removed anything.
cols = [
    c
    for c in dfx.columns
    if c == "id" or (c.startswith("discourse_type_") and c != "discourse_type_num")
]
# Copy so that adding `kfold` below writes to an independent frame and does
# not trigger SettingWithCopyWarning.
dfx = dfx[cols].copy()

# 10-fold multilabel-stratified splitter with shuffling and a fixed random_state.
mskf = MultilabelStratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Labels are captured before `kfold` is added, so the fold column is never
# part of the stratification targets.
labels = [c for c in dfx.columns if c != "id"]
dfx_labels = dfx[labels]

# Placeholder fold id; the fold-assignment loop overwrites it.
dfx["kfold"] = -1
dfx
Out[86]:
In [87]:
# Give every row of `dfx` the number of the fold in which it is held out.
for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    # split() yields positional indices; translate them to index labels so the
    # assignment stays correct even if `dfx` does not have a default RangeIndex.
    dfx.loc[dfx.index[val_], "kfold"] = fold

# Each row is a validation row in exactly one fold, so no placeholder may remain.
assert (dfx["kfold"] >= 0).all(), "some rows were not assigned to a fold"

# Attach the fold id to every row of `df` via its `id`. Dropping a stale
# `kfold` first keeps the cell re-runnable: merging twice would otherwise
# produce `kfold_x` / `kfold_y` and break `df.kfold` below.
df = df.drop(columns="kfold", errors="ignore").merge(
    dfx[["id", "kfold"]], on="id", how="left"
)
print(df.kfold.value_counts())
# df.to_csv("train_folds.csv", index=False)
In [88]:
# Non-null count of every column within each fold.
df.groupby("kfold").count()
Out[88]:
In [ ]:
Comments