#!/bin/bash -l
#
# Generate a timestamped tokenization config under configs/ and run the
# tokenization script on the sample dataset inside the project's Poetry
# environment.
#
# All paths are relative, so the script must be launched from the directory
# that contains configs/, data/ and teddy/.
#
# Requires: poetry on PATH (the -l login shell is kept so profile-provided
# PATH entries are available).

# Abort on the first failing command, unset variable, or failed pipeline
# stage, so tokenization is never started with a missing/partial config.
set -euo pipefail

# Timestamp (YYYYmmddHHMMSS) keeps each run's config file distinct.
TS=$(date '+%Y%m%d%H%M%S')

CONFIG_DIR="configs"
CONFIG_FILE="${CONFIG_DIR}/tokenization_config_${TS}.json"

# The redirect below fails if the directory is missing (e.g. fresh checkout).
mkdir -p -- "$CONFIG_DIR"

# Quoted delimiter: the JSON is written literally, with no shell expansion.
cat <<'EOF' > "$CONFIG_FILE"
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF

# `poetry shell` would open an interactive subshell and leave the rest of this
# script running outside the virtualenv; `poetry run` executes just this
# command inside the project's environment instead.
poetry run python teddy/data_processing/tokenization/tokenization.py \
  --data_path data/processed/sample_data.h5ad \
  --metadata_path data/processed/sample_data_metadata.json \
  --config_path "$CONFIG_FILE"