30 lines
1.2 KiB
Python
30 lines
1.2 KiB
Python
|
|
import pandas as pd
|
||
|
|
|
||
|
|
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle the rows of a CSV file and split them into train/val/test frames.

    The three weights are interpreted relative to their sum, so they do not
    have to add up to 100 (``train=8, val=1, test=1`` is an 80/10/10 split).

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        train: Relative weight of the training split.
        val: Relative weight of the validation split.
        test: Relative weight of the test split. It only contributes to the
            sum of the weights; the test split itself receives every row
            left over after the train and validation splits are cut.
        seed: Passed to ``DataFrame.sample`` as ``random_state`` so the
            shuffle is reproducible.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh 0-based index.

    Raises:
        ValueError: If any weight is negative or the weights do not sum to a
            positive number.
    """
    # Reject bad weights before touching the file: a negative weight would
    # produce negative slice bounds (silently overlapping splits) and an
    # all-zero total would surface as a bare ZeroDivisionError.
    if train < 0 or val < 0 or test < 0:
        raise ValueError(
            f"split weights must be non-negative, got "
            f"train={train}, val={val}, test={test}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("split weights must sum to a positive number")

    # 1) Read and shuffle rows with a fixed seed for reproducibility
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    # 2) Turn the weights into row counts relative to their sum
    n = len(df)
    n_train = int(n * train / total)  # floor to keep indices integral
    n_val = int(n * val / total)
    val_end = n_train + n_val

    # 3) Give the remainder to test to ensure every row is assigned
    #    (this naturally absorbs any rounding loss)
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)

    return train_df, val_df, test_df
|
||
|
|
|
||
|
|
# usage: split the RDF text dataset 80/10/10 and write each part to disk.
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"

train_df, val_df, test_df = split_csv_by_percent(
    DATASET,
    train=80,
    val=10,
    test=10,
    seed=42,
)

# The validation split is stored under the EVALUATION path.
# NOTE: to_csv is called with its defaults, so the row index is written too.
for _frame, _destination in (
    (train_df, TRAIN),
    (val_df, EVALUATION),
    (test_df, TEST),
):
    _frame.to_csv(_destination)
|