add divide method to create hold out dataset
This commit is contained in:
parent
ee12f53f12
commit
e9d30b3cea
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
@ -0,0 +1,29 @@
|
||||
import pandas as pd
|
||||
|
||||
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle a CSV file and split its rows into train/validation/test frames.

    The three weights are relative: each one is divided by their sum, so
    ``train=70, val=15, test=15`` and ``train=14, val=3, test=3`` describe the
    same split.

    The train and validation sizes are floored; the test split receives every
    remaining row, so it can be slightly larger than its nominal share.

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        train: Relative weight of the training split.
        val: Relative weight of the validation split.
        test: Relative weight of the test split.
        seed: Random state used for the shuffle, making the split reproducible.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh 0-based index. Together they contain every row exactly once.

    Raises:
        ValueError: If any weight is negative, or if all weights are zero.
    """
    # Validate up front: a negative weight would produce negative slice bounds
    # (silently wrong splits) and an all-zero total would divide by zero.
    if min(train, val, test) < 0:
        raise ValueError(
            f"split weights must be non-negative, got "
            f"train={train}, val={val}, test={test}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("at least one of train, val, test must be positive")

    # 1) Read and shuffle rows with a fixed seed for reproducibility.
    df = pd.read_csv(csv_path)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # 2) Turn the weights into row counts relative to their sum.
    n = len(df)
    n_train = int(n * train / total)  # floor so the slice bounds are integers
    n_val = int(n * val / total)
    val_end = n_train + n_val

    # 3) Give the remainder to test so every row is assigned
    #    (this naturally absorbs any rounding loss).
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)

    return train_df, val_df, test_df
|
||||
|
||||
# Usage: split the dataset into hold-out train / evaluation / test CSV files.
# All paths are relative to the current working directory.
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"


# Only perform the split when run as a script, so importing this module to
# reuse split_csv_by_percent does not read or write any files.
if __name__ == "__main__":
    import os

    train_df, val_df, test_df = split_csv_by_percent(
        DATASET, train=80, val=10, test=10, seed=42
    )

    # The validation split is stored as the "evaluation" file.
    outputs = ((train_df, TRAIN), (val_df, EVALUATION), (test_df, TEST))
    for frame, destination in outputs:
        # Make sure the target directory exists before writing.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        # index=False: the frames carry a throwaway 0..n-1 index; writing it
        # would add an unnamed extra column that the source CSV does not have.
        frame.to_csv(destination, index=False)
|
||||
Loading…
x
Reference in New Issue
Block a user