2025-10-11 16:49:36 +02:00

30 lines
1.2 KiB
Python

import pandas as pd
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle the rows of a CSV file and split them into three DataFrames.

    The three weights are interpreted relative to their sum, so they do not
    have to add up to 100 (``train=8, val=1, test=1`` is the same split as
    ``train=80, val=10, test=10``).

    Args:
        csv_path: Path of the CSV file, passed to ``pandas.read_csv``.
        train: Non-negative weight of the training part.
        val: Non-negative weight of the validation part.
        test: Non-negative weight of the test part. It only contributes to
            the sum of the weights: the test part always receives every row
            left over after the train and validation parts are cut.
        seed: Random state used for the shuffle, so that the same input file
            and seed always produce the same split.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh ``0..len-1`` index. Together they contain every row of the
        input exactly once.

    Raises:
        ValueError: If any weight is negative or the weights sum to zero.
    """
    # Reject weights that cannot describe a partition. A negative weight
    # would produce negative slice bounds (overlapping or missing rows) and
    # an all-zero total would fail with a bare ZeroDivisionError.
    if train < 0 or val < 0 or test < 0:
        raise ValueError(
            "train, val and test must be non-negative, "
            f"got train={train}, val={val}, test={test}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("train, val and test must not all be zero")

    # Read and shuffle all rows with a fixed seed for reproducibility.
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    # Row counts are floored so that slice bounds are integers.
    n = len(df)
    n_train = int(n * train / total)
    n_val = int(n * val / total)

    # The test part takes the remainder, which absorbs the rounding loss of
    # the two floors above and guarantees that every row is assigned.
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
    test_df = df.iloc[n_train + n_val:].reset_index(drop=True)
    return train_df, val_df, test_df
# Example run: build an 80/10/10 hold-out split of the dataset CSV.
# Input file and the three destination files; the validation part is
# stored under the name "evaluation.csv".
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"

train_df, val_df, test_df = split_csv_by_percent(
    DATASET,
    train=80,
    val=10,
    test=10,
    seed=42,
)

# Persist the parts in the order train, validation, test. to_csv is called
# with its defaults, so the row index is written as the first column.
for _part, _destination in (
    (train_df, TRAIN),
    (val_df, EVALUATION),
    (test_df, TEST),
):
    _part.to_csv(_destination)