From d1ff88da82942484e16621801a32c8ca6e5b3a21 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 7 Oct 2025 20:44:40 +0200 Subject: [PATCH] Added small dataset --- Assets/Dataset/1-hop/curated/corpus.txt | 3 +++ Assets/Dataset/1-hop/small/corpus.txt | 3 +++ Assets/Dataset/1-hop/small/rdf_completation.csv | 3 +++ Assets/Dataset/1-hop/small/rdf_text.csv | 3 +++ Project_Model/Libs/Transformer/Utils/post_tokenization.py | 5 ++--- 5 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 Assets/Dataset/1-hop/curated/corpus.txt create mode 100644 Assets/Dataset/1-hop/small/corpus.txt create mode 100644 Assets/Dataset/1-hop/small/rdf_completation.csv create mode 100644 Assets/Dataset/1-hop/small/rdf_text.csv diff --git a/Assets/Dataset/1-hop/curated/corpus.txt b/Assets/Dataset/1-hop/curated/corpus.txt new file mode 100644 index 0000000..a3c91d4 --- /dev/null +++ b/Assets/Dataset/1-hop/curated/corpus.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80a91d25999cde5410ea5a2d3b5682d72e9127957c283e7640249dc0f59f3e4d +size 65332292 diff --git a/Assets/Dataset/1-hop/small/corpus.txt b/Assets/Dataset/1-hop/small/corpus.txt new file mode 100644 index 0000000..3c6b917 --- /dev/null +++ b/Assets/Dataset/1-hop/small/corpus.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a5b951f6717d833d44ac8eefd30237ce453614cef2920707867273e05a65044 +size 13201385 diff --git a/Assets/Dataset/1-hop/small/rdf_completation.csv b/Assets/Dataset/1-hop/small/rdf_completation.csv new file mode 100644 index 0000000..815fc24 --- /dev/null +++ b/Assets/Dataset/1-hop/small/rdf_completation.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80da574017b251c9f07ecbce837d9d36a9ee8183a2a3bdbe0a2e31e22226ab79 +size 12773126 diff --git a/Assets/Dataset/1-hop/small/rdf_text.csv b/Assets/Dataset/1-hop/small/rdf_text.csv new file mode 100644 index 0000000..1041362 --- /dev/null +++ b/Assets/Dataset/1-hop/small/rdf_text.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b30ab739a01482036c40b6560adfe751c5905ae80aafef6ee0f1a716849c68 +size 13222824 diff --git a/Project_Model/Libs/Transformer/Utils/post_tokenization.py b/Project_Model/Libs/Transformer/Utils/post_tokenization.py index 23d5e4d..bac6b6d 100644 --- a/Project_Model/Libs/Transformer/Utils/post_tokenization.py +++ b/Project_Model/Libs/Transformer/Utils/post_tokenization.py @@ -45,9 +45,8 @@ def normalize_sequence( pad_token: int, end_token: int, ) -> tuple[list[int], list[bool]]: - - new_sequence = pad_sequence(sequence, max_length, pad_token) - new_sequence = truncate_sequence(new_sequence, max_length, end_token) + new_sequence = truncate_sequence(sequence, max_length, end_token) + new_sequence = pad_sequence(new_sequence, max_length, pad_token) PADDING_MASK = create_padding_mask(new_sequence, pad_token) return (new_sequence, PADDING_MASK)