diff --git a/Scripts/DataCleaning/hold_out/divide.py b/Scripts/DataCleaning/hold_out/divide.py
new file mode 100644
index 0000000..1b50f8a
--- /dev/null
+++ b/Scripts/DataCleaning/hold_out/divide.py
@@ -0,0 +1,29 @@
+import pandas as pd
+
+def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
+    # 1) Read and shuffle rows with a fixed seed for reproducibility
+    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
+
+    # 2) Turn the three inputs into proportions relative to their sum
+    total = train + val + test
+    n = len(df)
+    n_train = int(n * train / total)  # floor to keep indices integral
+    n_val = int(n * val / total)
+    # 3) Give the remainder to test so every row is assigned
+    #    (this naturally absorbs any rounding loss)
+    train_df = df.iloc[:n_train].reset_index(drop=True)
+    val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
+    test_df = df.iloc[n_train + n_val:].reset_index(drop=True)
+
+    return train_df, val_df, test_df
+
+# usage:
+DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
+TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
+TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
+EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"
+train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
+
+train_df.to_csv(TRAIN, index=False)
+val_df.to_csv(EVALUATION, index=False)
+test_df.to_csv(TEST, index=False)
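
A minimal sanity check of the floor-plus-remainder logic (the 103-row toy CSV below is made up for illustration, and it assumes split_csv_by_percent from the diff above is available in the same session, e.g. pasted into a REPL or a test file): with 103 rows and an 80/10/10 request, train gets int(82.4) = 82 rows, validation gets int(10.3) = 10, and the remaining 11 rows fall to test, so the three pieces always cover the full dataset exactly once.

import pandas as pd

# Hypothetical 103-row CSV, only to exercise the rounding/remainder behaviour.
pd.DataFrame({"text": [f"row {i}" for i in range(103)]}).to_csv("toy.csv", index=False)

train_df, val_df, test_df = split_csv_by_percent("toy.csv", train=80, val=10, test=10)
print(len(train_df), len(val_df), len(test_df))  # expected: 82 10 11
assert len(train_df) + len(val_df) + len(test_df) == 103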