# NanoSocrates/Scripts/DataCleaning/hold_out/divide.py

from pathlib import Path

import pandas as pd

def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle the rows of a CSV and split them into train/val/test frames.

    The three weights are relative to their own sum, so they do not have to
    add up to 100: each of the train and validation splits receives
    ``floor(n_rows * weight / (train + val + test))`` rows.

    Args:
        csv_path: Source passed straight to ``pandas.read_csv``.
        train: Relative weight of the training split.
        val: Relative weight of the validation split.
        test: Relative weight of the test split. It only contributes to the
            sum of the weights; the test split is made of every row left
            over after train and validation, so it also absorbs the rows
            lost to flooring.
        seed: ``random_state`` used for the shuffle, so the same input and
            seed always produce the same split.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh 0-based index.

    Raises:
        ValueError: If any weight is negative, or if the weights do not sum
            to a positive number.
    """
    # Validate before touching the file: a negative weight would produce
    # negative slice bounds (silently wrong splits) and an all-zero sum
    # would fail later with an unhelpful ZeroDivisionError.
    weights = {"train": train, "val": val, "test": test}
    negative = [name for name, weight in weights.items() if weight < 0]
    if negative:
        raise ValueError(
            f"split weights must be non-negative, got negative: {', '.join(negative)}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("split weights must sum to a positive number")

    # 1) Read and shuffle rows with a fixed seed for reproducibility
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    # 2) Turn the weights into row counts relative to their sum
    n = len(df)
    n_train = int(n * train / total)   # floor to keep indices integral
    n_val = int(n * val / total)

    # 3) Slice consecutively; the remainder goes to test so every row is
    #    assigned exactly once
    val_end = n_train + n_val
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)

    return train_df, val_df, test_df

# usage:
# Input dataset and the three output files of the hold-out split.
DATASET =       "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN =         "Assets/Dataset/Tmp/hold_out/train.csv"
TEST =          "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION =    "Assets/Dataset/Tmp/hold_out/evaluation.csv"

# 80/10/10 split, shuffled with a fixed seed so reruns give the same files.
train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)

# to_csv does not create missing parent directories, so make sure they exist
# before writing; otherwise the script fails only after the split is done.
for destination in (TRAIN, EVALUATION, TEST):
    Path(destination).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): to_csv is called with its default index=True, so each file
# gets the 0-based row index as an unnamed first column. Pass index=False if
# downstream readers do not expect that column - confirm before changing.
train_df.to_csv(TRAIN)
val_df.to_csv(EVALUATION)   # the validation split is stored as "evaluation"
test_df.to_csv(TEST)