"""Split a CSV dataset into train / validation / test hold-out files."""
import pandas as pd


def split_csv_by_percent(csv_path, train: float = 70, val: float = 15,
                         test: float = 15, seed: int = 42):
    """Shuffle the rows of a CSV and split them into three DataFrames.

    The three weights are interpreted relative to their sum, so they do not
    have to add up to 100 (``train=8, val=1, test=1`` is the same split as
    ``train=80, val=10, test=10``).

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts (path or file-like).
        train: Relative weight of the training split.
        val: Relative weight of the validation split.
        test: Relative weight of the test split. It only contributes to the
            total; the test split itself receives every row left over after
            the train and validation splits have been cut.
        seed: ``random_state`` used for the shuffle, for reproducibility.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh 0-based index.

    Raises:
        ValueError: If any weight is negative or the weights sum to zero.
    """
    # Validate before touching the file: a negative weight would otherwise
    # turn into a negative slice bound and silently cut rows from the end,
    # and an all-zero total would surface as a bare ZeroDivisionError.
    if train < 0 or val < 0 or test < 0:
        raise ValueError(
            f"split weights must be non-negative, got "
            f"train={train}, val={val}, test={test}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("split weights must sum to a positive value")

    # 1) Read and shuffle rows with a fixed seed for reproducibility.
    df = (
        pd.read_csv(csv_path)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # 2) Turn the weights into row counts relative to their sum.
    n = len(df)
    n_train = int(n * train / total)  # floor so the bounds stay integral
    n_val = int(n * val / total)

    # 3) Give the remainder to test so every row is assigned exactly once
    #    (this naturally absorbs any rounding loss from the two floors).
    val_end = n_train + n_val
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)
    return train_df, val_df, test_df


# Input dataset and the three hold-out files produced from it.
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"


def main():
    """Split DATASET 80/10/10 and write the train/evaluation/test CSVs."""
    train_df, val_df, test_df = split_csv_by_percent(
        DATASET, train=80, val=10, test=10, seed=42
    )
    # NOTE(review): to_csv writes the row index as an unnamed leading column
    # by default, so these files carry one more column than DATASET. Pass
    # index=False if downstream readers expect the original columns only.
    train_df.to_csv(TRAIN)
    val_df.to_csv(EVALUATION)
    test_df.to_csv(TEST)


# Guarded so that importing this module (e.g. to reuse the split function)
# does not read the dataset or overwrite the hold-out files.
if __name__ == "__main__":
    main()