NanoSocrates/Scripts/DataCleaning/hold_out/divide.py

from pathlib import Path

import pandas as pd

def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle a CSV file and split its rows into train/validation/test sets.

    The three weights are relative: they are normalised by their sum, so
    ``(70, 15, 15)`` and ``(14, 3, 3)`` describe the same split.

    The train and validation sizes are floored; the test set receives every
    remaining row, so it may be slightly larger than its nominal share.

    Args:
        csv_path: Source of the CSV data, passed straight to
            ``pandas.read_csv``.
        train: Relative weight of the training set.
        val: Relative weight of the validation set.
        test: Relative weight of the test set.
        seed: Random state used for the shuffle, making the split
            reproducible.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames, each with a
        fresh ``0..len-1`` index. Every input row lands in exactly one of
        them.

    Raises:
        ValueError: If any weight is negative or the weights sum to zero.
    """
    # Validate before touching the file: a negative weight would yield
    # negative slice bounds (silently wrong splits) and a zero sum would
    # otherwise surface as an unexplained ZeroDivisionError.
    if min(train, val, test) < 0:
        raise ValueError(
            f"split weights must be non-negative, got "
            f"train={train}, val={val}, test={test}"
        )
    total = train + val + test
    if total <= 0:
        raise ValueError("split weights must sum to a positive value")

    # 1) Read and shuffle rows with a fixed seed for reproducibility
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    # 2) Turn the weights into row counts relative to their sum
    n = len(df)
    n_train = int(n * train / total)   # floor to keep indices integral
    n_val = int(n * val / total)

    # 3) Give the remainder to test to ensure every row is assigned
    #    (this naturally absorbs any rounding loss)
    val_end = n_train + n_val
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df = df.iloc[val_end:].reset_index(drop=True)

    return train_df, val_df, test_df

# Usage: split the RDF/text dataset into hold-out train / evaluation / test
# CSV files with an 80/10/10 ratio.
# Paths are relative to the current working directory (presumably the
# repository root; confirm against how this script is launched).
DATASET =       "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN =         "Assets/Dataset/Tmp/hold_out/train.csv"
TEST =          "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION =    "Assets/Dataset/Tmp/hold_out/evaluation.csv"
train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)

# DataFrame.to_csv does not create missing parent directories, so make sure
# each output folder exists before writing (done after the split so nothing
# is created if reading the dataset fails).
Path(TRAIN).parent.mkdir(parents=True, exist_ok=True)
Path(EVALUATION).parent.mkdir(parents=True, exist_ok=True)
Path(TEST).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): to_csv writes the DataFrame index as an unnamed first column
# by default; pass index=False if downstream readers do not expect it.
train_df.to_csv(TRAIN)
val_df.to_csv(EVALUATION)
test_df.to_csv(TEST)
add divide method to create hold out dataset 2025-10-11 16:49:36 +02:00			`import pandas as pd`

			`def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):`
			`# 1) Read and shuffle rows with a fixed seed for reproducibility`
			`df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)`

			`# 2) Turn the three inputs into proportions relative to their sum`
			`total = train + val + test # eheh you got it there :p`
			`n = len(df)`
			`n_train = int(n * train / total) # floor to keep indices integral`
			`n_val = int(n * val / total)`
			`# 3) Give the remainder to test to ensure every row is assigned`
			`# (this naturally absorbs any rounding loss)`
			`train_df = df.iloc[:n_train].reset_index(drop=True)`
			`val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)`
			`test_df = df.iloc[n_train + n_val:].reset_index(drop=True)`

			`return train_df, val_df, test_df`

			`# usage:`
			`DATASET = "Assets/Dataset/Tmp/rdf_text.csv"`
			`TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"`
			`TEST = "Assets/Dataset/Tmp/hold_out/test.csv"`
			`EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"`
			`train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)`

			`train_df.to_csv(TRAIN)`
			`val_df.to_csv(EVALUATION)`
			`test_df.to_csv(TEST)`