diff --git a/.gitignore b/.gitignore index 64270f0752..0a1387f56a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,51 @@ +# User Added +**.e1** +**.o1** +deps/* +OUTPUTS/* +ALCF/OUTPUTS/* +*tmp* +*core.* +*old* +*.bak +**index-cache** +**pbslogs** +ezpz +*.o17* +*.e17* +*hostfile* +.deepspeed_env +*.DS_Store +old/* +**venv** +*.json +*.o1 +*.e1 +outputs/ +venvs/ +wandb/ +llama-logs/ +checkpoints/ +*.gz +*.txt +*.idx +*.bin +*.log +__pycache__ + +.deepspeed_env +*.bak +.cache/* +outputs/ +venvs/ +wandb/ +llama-logs/ +checkpoints/ +*.gz +*.txt +*.idx +*.bin +*.log __pycache__ # Distribution / packaging @@ -20,4 +68,4 @@ slurm* logs # Data folder -bookcorpus_data/ \ No newline at end of file +bookcorpus_data/ diff --git a/ALCF/README.md b/ALCF/README.md new file mode 100644 index 0000000000..907cbd36cc --- /dev/null +++ b/ALCF/README.md @@ -0,0 +1,798 @@ +# Megatron-DeepSpeed @ ALCF + + +## 🆘 Getting Started + +> [!NOTE] +> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching +> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. + + + + +## 🏃‍♂️ Running + +To launch on {`Polaris`, `Sunspot`} @ [ALCF](https://alcf.anl.gov): + +
⏳ Request an interactive job with qsub -I: + +```bash +qsub -A <your-project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I +``` + +
+ +
⬇️ Clone repo + navigate into it: + +```bash +git clone "https://github.com/argonne-lcf/Megatron-DeepSpeed" +cd Megatron-DeepSpeed +``` + +
+ +
🐍 Setup Python: + +1. 📂 Load `conda` module and activate base environment: + + - **Polaris**: + + ```bash + module use /soft/modulefiles ; module load conda ; conda activate base + ``` + + - **Sunspot**: + + ```bash + source ALCF/sunspot-env-2024-04-15-002.sh + ``` + +3. 👻 Create virtual environment _on top of the base `conda`_[^venv]: + + ```bash + export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup_venv_from_conda + ``` + + +4. 🍋 Install [`ezpz`](https://github.com/saforem2/ezpz): + + ```bash + mkdir deps && git clone https://github.com/saforem2/ezpz deps/ezpz + python3 -m pip install -e deps/ezpz --require-virtualenv + ``` + +[^venv]: It's generally good practice to keep separate virtual Python environments for different projects. + We provide a helper function, [`setup_venv_from_conda()`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/2f0154394bbdf3c64b4669f9d944645e2cdb8f2b/ALCF/helpers.sh#L440), + that helps take care of this for you. +
+ This will: activate (or build, if necessary) a `venv` in your working dir, + _automatically_ matching the name of your active `conda` environment (e.g. `2024-04-29`, on Polaris). + +
+ + + +
🚀 Launch: + +In this case, train a ~ 2B Model (with 10 layers), +for 1000 iterations using the data file list in: + +[`ALCF/data-lists/polaris/books.txt`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/data-lists/polaris/books.txt) + +with a micro-batch-size of 2, with the `torch.optim.AdamW` optimizer. + +**Note** that _any_ of the options in the [`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140) +function from [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh) +can be overridden dynamically at runtime using this technique. + +```bash +PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt TRAIN_ITER=1000 NLAYERS=10 MICRO_BATCH=2 OPT=adamw bash train_llama_alcf.sh +``` + +
[output]: + +
+ +
[Sunspot]: + +```bash +# [09:07:32 AM] [foremans@x1921c0s0b0n0] ~/q/llm.devkit/Megatron-DeepSpeed  main !1 ?27 q4-drop 26s ✘ INT +$ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./convergence_debug_small.txt bash train_llama_alcf.sh +source-ing /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/helpers.sh +Sourcing /home/foremans/q4-drop_sunspot/llm.devkit/setenv.sh... + UMD: agama-ci-devel-736.9 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-736.9 +Lmod has detected the following error: The following module(s) are unknown: "gcc/12.1.0" + +Please check the spelling or version number. Also try "module spider ..." +It is also possible your cache file is out-of-date; it may help to try: + $ module --ignore_cache load "gcc/12.1.0" + +Also make sure that all modulefiles written in TCL start with the string #%Module + +Note: the module "intel_compute_runtime/release/agama-devel-647" cannot be unloaded because it was not loaded. + +Running on SunSpot !! +[python] Using: /home/foremans/miniconda3/envs/q4-drop/bin/python3 +Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env +Found ezpz! +/lus/gila/projects/Aurora_deployment/foremans/locations/sunspot/projects/saforem2/ezpz/src/ezpz/__init__.py +Has ezpz installed. Nothing to do. +Done with ezpz. 
+┌─────────────────────────────────────────────────────────────────── +│ Writing PBS vars to /home/foremans/.pbsenv +│ HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 +│ NHOSTS: 2 +│ NGPU_PER_HOST: 12 GPUs per host +│ NGPUS: 24 GPUs total +└─────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Hosts]: +│ • [host:0] - x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com +│ • [host:1] - x1921c0s1b0n0.hostmgmt2000.cm.americas.sgi.com +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [DIST INFO]: +│ • Loading job env from: /home/foremans/.pbsenv +│ • HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 +│ • NHOSTS: 2 +│ • NGPU_PER_HOST: 12 +│ • NGPUS (NHOSTS x NGPU_PER_HOST): 24 +│ • WORLD_SIZE: 24 +│ • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001 +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Launch]: +│ • Use: 'launch' (=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001) +│ to launch job +└────────────────────────────────────────────────────────────────── +DS_CONFIG: ds_stage2_mb4_gb96_pp1_bf16.json +ZS: 2, CPU_OPTIMIZER: , MB: 4, GB: 96, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0 +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! 
+Calling: setData() with ./convergence_debug_small.txt +-------------------- +Updated environment: +DATA_FILE_LIST: ./convergence_debug_small.txt +NUM_DOCS: 15 + WEIGHT_SUM: 15.0 +DFL_STEM: convergence_debug_small +DATA_CACHE_PATH: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache +-------------------- +++++++++++++++++++++++++++++++++++++++++++++++++++ +- MPICH_DIR= +- Using /home/foremans/miniconda3/envs/q4-drop/bin/python3 +- WORLD_SIZE:24 +- NCCL: nccl +- MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden +- Using DATA_FILE_LIST: ./convergence_debug_small.txt +++++++++++++++++++++++++++++++++++++++++++++++++++ +! Using /home/foremans/miniconda3/envs/q4-drop/bin/deepspeed +/home/foremans/miniconda3/envs/q4-drop/bin/ds_report:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html + __import__('pkg_resources').require('deepspeed==0.12.3+6ea44d02') +/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you dont plan on using image function +ality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torch +vision` from source? + warn( +[2024-04-04 09:07:45,585] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) +[2024-04-04 09:07:45,818] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) +-------------------------------------------------- +DeepSpeed C++/CUDA extension op report +-------------------------------------------------- +NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. 
Op compatibility means that your system + meet the required dependencies to JIT install the op. +-------------------------------------------------- +JIT compiled ops requires ninja +ninja .................. [OKAY] +-------------------------------------------------- +op name ................ installed .. compatible +-------------------------------------------------- +async_io ............... [NO] ....... [OKAY] +cpu_adagrad ............ [NO] ....... [OKAY] +cpu_adam ............... [NO] ....... [OKAY] +flash_attn ............. [NO] ....... [OKAY] +fused_adam ............. [NO] ....... [OKAY] +quantizer .............. [NO] ....... [OKAY] +transformer ............ [NO] ....... [OKAY] +transformer_inference .. [NO] ....... [OKAY] +utils .................. [NO] ....... [OKAY] +-------------------------------------------------- +DeepSpeed general environment info: +torch install path ............... ['/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch'] +torch version .................... 2.1.0a0+cxx11.abi +deepspeed install path ........... ['/lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/DeepSpeed/deepspeed'] +deepspeed info ................... 0.12.3+6ea44d02, 6ea44d02, HEAD +deepspeed wheel compiled w. ...... torch 2.1 +shared memory (/dev/shm) size .... 
503.18 GB + + deepspeed --hostfile /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/gila/projects/Aurora_deployment/ +foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --lr-decay +-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce-grads-in-fp32 + --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/tensorboard --log-timers-to-tensorboard --log-optimizer +-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 + --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend ccl --num-attention-heads 32 --save-interval 20 +0 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 4 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 1 --global-bat +ch-size 96 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ +.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/tokenizer.model --no-query- +key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --deepspeed-activation-checkpointing --z +ero-stage=2 --deepspeed_config=ds_stage2_mb4_gb96_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint-num-layers 1 |& tee logs/ds_stage2 
+_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + +[!! NOTE] View output at: +logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + +# ... + +/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.010017 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_doc_idx.npy + > loading sample-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_sample_idx.npy + > loading shuffle-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_shuffle_idx.npy + loaded indexed file in 0.056 seconds + total number of samples: 2318461 + total number of epochs: 8 +> loading blendable dataset index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_index.npy +> loading blendable dataset sample index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_sample_index.npy 
+/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 2 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. + +[after dataloaders are built] datetime: 2024-04-04 09:09:27 +done with setup ... +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (64818.18, 64858.22) + train/valid/test-data-iterators-setup ..........: (1968.10, 2288.56) +training ... +[before the start of training step] datetime: 2024-04-04 09:09:27 +[2024-04-04 09:09:27,718] [INFO] [checkpointing.py:540:forward] Activation Checkpointing Information +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:541:forward] ----Partition Activations False, CPU CHECKPOINTING False +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:542:forward] ----contiguous Memory Checkpointing False with 32 total layers +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:544:forward] ----Synchronization False +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:545:forward] ----Profiling time in checkpointing False +[2024-04-04 09:09:33][INFO][utils:145] - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. +[2024-04-04 09:09:33][INFO][utils:148] - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. +[2024-04-04 09:09:33][INFO][utils:160] - NumExpr defaulting to 8 threads. 
+^[c[2024-04-04 09:09:53,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 884.11 | optimizer_gradients: 6.43 | optimizer_step: 23.44 +[2024-04-04 09:09:53,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 6567.68 | bwd_microstep: 17950.36 | bwd_inner_microstep: 17711.20 | bwd_allreduce_microstep: 239.11 | step_microstep: 1139.27 +[2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 6567.66 | bwd: 17950.35 | bwd_inner: 17711.19 | bwd_allreduce: 239.11 | step: 1139.29 +[Rank 0] (after 1 iterations) memory (MB) | allocated: 18244.640625 | max allocated: 41299.50146484375 | reserved: 46764.0 | max reserved: 46764.0 + iteration 1/ 317892 | consumed samples: 96 | consumed tokens: 393216 | elapsed time per iteration (ms): 25849.1 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 1.117136E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.714 | tokens per gpu per second(tgs): 633.832 | TFLOPs: 38.61 | +[2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 327.85 | optimizer_gradients: 6.26 | optimizer_step: 23.60 +[2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 4022.74 | bwd_microstep: 15738.67 | bwd_inner_microstep: 15556.80 | bwd_allreduce_microstep: 181.82 | step_microstep: 371.01 +[2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4022.73 | bwd: 15738.66 | bwd_inner: 15556.62 | bwd_allreduce: 181.81 | step: 371.02 + iteration 2/ 
317892 | consumed samples: 192 | consumed tokens: 786432 | elapsed time per iteration (ms): 20298.3 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 2.537718E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 4.729 | tokens per gpu per second(tgs): 807.159 | TFLOPs: 49.17 | +``` + +
+ +
[Polaris]: + +```bash +[09:31:35 AM] [foremans@x3112c0s13b0n0] ~/pol/p/a/Megatron-DeepSpeed  main !4 ?24 cu118-pt221 ✘ INT +$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./convergence_debug_small.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh +source-ing /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/helpers.sh +Running on Polaris !! + +[python] Using: /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 +Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env +Found ezpz! +/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py +Has ezpz installed. Nothing to do. +Done with ezpz. +┌─────────────────────────────────────────────────────────────────── +│ Writing PBS vars to /home/foremans/.pbsenv +│ HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +│ NHOSTS: 2 +│ NGPU_PER_HOST: 4 GPUs per host +│ NGPUS: 8 GPUs total +└─────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Hosts]: +│ • [host:0] - x3112c0s13b0n0.hsn.cm.polaris.alcf.anl.gov +│ • [host:1] - x3112c0s13b1n0.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [DIST INFO]: +│ • Loading job env from: /home/foremans/.pbsenv +│ • HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +│ • NHOSTS: 2 +│ • NGPU_PER_HOST: 4 +│ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 +│ • WORLD_SIZE: 8 +│ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Launch]: +│ • Use: 'launch' (=mpiexec --verbose --envall -n 8 
-ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) +│ to launch job +└────────────────────────────────────────────────────────────────── +DS_CONFIG: ds_stage2_mb8_gb32_pp1_bf16.json +ZS: 2, CPU_OPTIMIZER: , MB: 8, GB: 32, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0 +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +Calling: setData() with ./convergence_debug_small.txt +-------------------- +Updated environment: +DATA_FILE_LIST: ./convergence_debug_small.txt +NUM_DOCS: 15 + WEIGHT_SUM: 15.0 +DFL_STEM: convergence_debug_small +DATA_CACHE_PATH: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache +-------------------- +++++++++++++++++++++++++++++++++++++++++++++++++++ +- MPICH_DIR=/opt/cray/pe/mpich/8.1.25/ofi/gnu/9.1 +- Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 +- WORLD_SIZE:8 +- NCCL: nccl +- MODEL_TYPE: llama-seq4096-pp1-tp2-32layers-32heads-4096hidden +- Using DATA_FILE_LIST: ./convergence_debug_small.txt +++++++++++++++++++++++++++++++++++++++++++++++++++ +! Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/deepspeed +[2024-04-04 09:35:35,959] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +-------------------------------------------------- +DeepSpeed C++/CUDA extension op report +-------------------------------------------------- +NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. +-------------------------------------------------- +JIT compiled ops requires ninja +ninja .................. [OKAY] +-------------------------------------------------- +op name ................ installed .. 
compatible +-------------------------------------------------- +async_io ............... [NO] ....... [OKAY] +fused_adam ............. [NO] ....... [OKAY] +cpu_adam ............... [NO] ....... [OKAY] +cpu_adagrad ............ [NO] ....... [OKAY] +cpu_lion ............... [NO] ....... [OKAY] + [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +evoformer_attn ......... [NO] ....... [NO] +fused_lamb ............. [NO] ....... [OKAY] +fused_lion ............. [NO] ....... [OKAY] +inference_core_ops ..... [NO] ....... [OKAY] +cutlass_ops ............ [NO] ....... [OKAY] +transformer_inference .. [NO] ....... [OKAY] +quantizer .............. [NO] ....... [OKAY] +ragged_device_ops ...... [NO] ....... [OKAY] +ragged_ops ............. [NO] ....... [OKAY] +random_ltd ............. [NO] ....... [OKAY] + [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.2 + [WARNING] using untested triton version (2.2.0), only 1.0.0 is known to be compatible +sparse_attn ............ [NO] ....... [NO] +spatial_inference ...... [NO] ....... [OKAY] +transformer ............ [NO] ....... [OKAY] +stochastic_transformer . [NO] ....... [OKAY] +-------------------------------------------------- +DeepSpeed general environment info: +torch install path ............... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/torch'] +torch version .................... 2.2.1 +deepspeed install path ........... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/deepspeed'] +deepspeed info ................... 0.14.0, unknown, unknown +torch cuda version ............... 11.8 +torch hip version ................ None +nvcc version ..................... 11.8 +deepspeed wheel compiled w. ...... torch 2.2, cuda 11.8 +shared memory (/dev/shm) size .... 
251.61 GB + + deepspeed --hostfile /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/eagle/projects/datascienc +e/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion + --lr-decay-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce- +grads-in-fp32 --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/tensorboard --log-timers-to-tensorboard - +-log-optimizer-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_ +pp1_tp2_bf16 --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend nccl --num-attention-heads 32 --s +ave-interval 200 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 8 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 2 + --global-batch-size 32 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-l +cf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokeniz +er.model --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --use-flash-attn-v2 + --deepspeed-activation-checkpointing --zero-stage=2 --deepspeed_config=ds_stage2_mb8_gb32_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint- +num-layers 1 |& tee 
logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + +[!! NOTE] View output at: +logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + +# ... + +/eagle/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.001280 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_doc_idx.npy + > loading sample-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_sample_idx.npy + > loading shuffle-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_shuffle_idx.npy + loaded indexed file in 0.004 seconds + total number of samples: 869423 + total number of epochs: 3 +> loading blendable dataset index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_index.npy +> loading blendable dataset sample index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_sample_index.npy +> size of blendable dataset: 10223415 samples +> finished 
creating GPT datasets ... +[after dataloaders are built] datetime: 2024-04-04 09:36:07 +done with setup ... +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (4794.78, 4795.23) + train/valid/test-data-iterators-setup ..........: (589.69, 721.20) +training ... +[before the start of training step] datetime: 2024-04-04 09:36:07 +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:539:forward] Activation Checkpointing Information +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:540:forward] ----Partition Activations False, CPU CHECKPOINTING False +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:541:forward] ----contiguous Memory Checkpointing False with 32 total layers +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:543:forward] ----Synchronization False +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:544:forward] ----Profiling time in checkpointing False +[2024-04-04 09:36:28,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1626.54 | optimizer_gradients: 19.29 | optimizer_step: 419.48 +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 11336.34 | bwd_microstep: 7134.73 | bwd_inner_microstep: 7090.02 | bwd_allreduce_microstep: 44.65 | step_microstep: 2564.02 +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 11336.33 | bwd: 7134.75 | bwd_inner: 7090.01 | bwd_allreduce: 44.66 | step: 2564.02 + iteration 1/ 317892 | consumed samples: 32 | consumed tokens: 131072 | elapsed time per iteration (ms): 21133.8 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.119983E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1.514 | tokens per gpu per second(tgs): 
775.250 | TFLOPs: 47.23 | +[Rank 1] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 35824.0 +[Rank 0] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 32994.0 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1605.55 | optimizer_gradients: 11.56 | optimizer_step: 50.92 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.17 | bwd_microstep: 6832.48 | bwd_inner_microstep: 6789.73 | bwd_allreduce_microstep: 42.70 | step_microstep: 1867.64 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 1395.15 | bwd: 6832.49 | bwd_inner: 6789.73 | bwd_allreduce: 42.71 | step: 1867.65 + iteration 2/ 317892 | consumed samples: 64 | consumed tokens: 262144 | elapsed time per iteration (ms): 10154.3 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.766422E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.151 | tokens per gpu per second(tgs): 1613.503 | TFLOPs: 98.29 | + +# ... +``` + +
+ +
+ +
+ + + + + + + + + + + + + +## 📝 Data Preprocessing + +
Data Pre-Processing: + +AuroraGPT is trained on the Dolma dataset (initially v0), now in the process of moving to v6. For more details on the dataset, refer to https://huggingface.co/datasets/allenai/dolma. The downloaded Dolma dataset has already been preprocessed to remove duplicates (dedup) and to filter the data (mixing). For more details refer to https://github.com/allenai/dolma/tree/main/docs and https://github.com/vksastry/dolma_alcf/blob/main/ALCF/Readme.md. + +The data preprocessing of the Dolma dataset before training consists of tokenizing the data using a specific tokenizer (LlamaTokenizer is what we are currently using). Use the script below to tokenize the entire dataset. Example shown for Polaris. + +``` bash +cd /eagle/datasets/dolma/utils +./tokenization.sh +``` + +
+ +## ✅ TODOs + +
+TODOs: + +- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` + - [ ] specifically, `momentum, beta{1, 2}, etc` + +
Completed + +- Continue runs on Polaris @ + - [x] 48 Nodes + - [x] 32 Nodes + - [x] 16 Nodes + - [x] 8 Nodes + - [x] 4 Nodes + +- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` + - 😔, failed. + +- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~: + - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5) + +
🐛 Bug + + - Training progresses OK: + + ```bash + [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | + [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | + saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 + # ... 
+ ``` + + - Then crashes with: + + ```python + Traceback (most recent call last): + Traceback (most recent call last): + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in <module> + model = main() + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main + model = pretrain( + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain + iteration = train(forward_step_func, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train + save_checkpoint_and_time(iteration, model, optimizer, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint + state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info + info.update(model[0].universal_checkpoint_info()) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info + info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() + File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' + ``` + + 🤔 +
+ +
+ +
+ + + + + diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh new file mode 100644 index 0000000000..ffd1471cd3 --- /dev/null +++ b/ALCF/aws_ofi_nccl_plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash --login + +# AWS NCCL OFI Plugin settings below +export NCCL_CROSS_NIC=1 +export NCCL_COLLNET_ENABLE=1 +export NCCL_NET="AWS Libfabric" +export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH +export FI_CXI_DISABLE_HOST_REGISTER=1 +export FI_MR_CACHE_MONITOR=userfaultfd +export FI_CXI_DEFAULT_CQ_SIZE=131072 +######################################################### +# WARNING: !!! +# - Currently, `export NCCL_NET_GDR_LEVEL=PHB` +# causes a hang on Polaris. +# so, we don't set it for the time being [2024-05-14]. +# - Seems to work on Perlmutter ??? +# +# export NCCL_NET_GDR_LEVEL=PHB +######################################################### diff --git a/ALCF/data-lists/polaris/algebraic.txt b/ALCF/data-lists/polaris/algebraic.txt new file mode 100644 index 0000000000..505276d3bf --- /dev/null +++ b/ALCF/data-lists/polaris/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document diff --git a/ALCF/data-lists/polaris/arxiv.txt b/ALCF/data-lists/polaris/arxiv.txt new file mode 100644 index 0000000000..cae6e2da69 --- /dev/null +++ b/ALCF/data-lists/polaris/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document diff --git a/ALCF/data-lists/polaris/books.txt b/ALCF/data-lists/polaris/books.txt new file mode 100644 index 0000000000..195aca5339 --- /dev/null +++ b/ALCF/data-lists/polaris/books.txt @@ -0,0 +1,3 @@ +0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/polaris/c4.txt b/ALCF/data-lists/polaris/c4.txt new file mode 100644 index 0000000000..833b095882 --- /dev/null +++ b/ALCF/data-lists/polaris/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document diff --git a/ALCF/data-lists/polaris/cc.txt b/ALCF/data-lists/polaris/cc.txt new file mode 100644 index 0000000000..edf6aab8c1 --- /dev/null +++ b/ALCF/data-lists/polaris/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document 
+0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document 
+0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document 
+0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document + diff --git a/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..6f34558ec3 --- /dev/null +++ b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document 
+0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document 
+0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document 
+0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document 
+0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document 
+0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/polaris/falcon.txt b/ALCF/data-lists/polaris/falcon.txt new file mode 100644 index 0000000000..68aeb2f27b --- /dev/null +++ b/ALCF/data-lists/polaris/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document + diff --git a/ALCF/data-lists/polaris/megawiki.txt b/ALCF/data-lists/polaris/megawiki.txt new file mode 100644 index 0000000000..4c4f47df5f --- /dev/null +++ b/ALCF/data-lists/polaris/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document 
+1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document 
+1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document 
+5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document 
+5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document 
+5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document 
+2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document 
+2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document 
+1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document 
+8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document 
+1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document 
+1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document 
+2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document 
+1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document diff --git a/ALCF/data-lists/polaris/open-web-math-train.txt b/ALCF/data-lists/polaris/open-web-math-train.txt new file mode 100644 index 0000000000..caab74fb9f --- /dev/null +++ b/ALCF/data-lists/polaris/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document diff --git a/ALCF/data-lists/polaris/pes2o.txt b/ALCF/data-lists/polaris/pes2o.txt new file mode 100644 index 0000000000..7bb62d46b6 --- /dev/null +++ b/ALCF/data-lists/polaris/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document diff --git a/ALCF/data-lists/polaris/reddit.txt b/ALCF/data-lists/polaris/reddit.txt new file mode 100644 index 0000000000..4f46ee0e64 --- /dev/null +++ b/ALCF/data-lists/polaris/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document diff --git a/ALCF/data-lists/polaris/stack.txt b/ALCF/data-lists/polaris/stack.txt new file mode 100644 index 0000000000..971329159b --- /dev/null +++ b/ALCF/data-lists/polaris/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document diff --git a/ALCF/data-lists/polaris/starcoder.txt b/ALCF/data-lists/polaris/starcoder.txt new file mode 100644 index 0000000000..c675b0b84d --- /dev/null +++ b/ALCF/data-lists/polaris/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document 
+0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document 
+0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document + diff --git a/ALCF/data-lists/polaris/tulu.txt b/ALCF/data-lists/polaris/tulu.txt new file mode 100644 index 0000000000..a65ae2b93a --- /dev/null +++ b/ALCF/data-lists/polaris/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document 
+0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document 
+0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document diff --git a/ALCF/data-lists/polaris/wiki.txt b/ALCF/data-lists/polaris/wiki.txt new file mode 100644 index 0000000000..7759120d87 --- /dev/null +++ b/ALCF/data-lists/polaris/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/sirius/books.txt b/ALCF/data-lists/sirius/books.txt new file mode 100644 index 0000000000..7567ba5227 --- /dev/null +++ b/ALCF/data-lists/sirius/books.txt @@ -0,0 +1,3 @@ +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0000_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0001_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/algebraic.txt b/ALCF/data-lists/sunspot/algebraic.txt new file mode 100644 index 0000000000..0f25e30498 --- /dev/null +++ b/ALCF/data-lists/sunspot/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document 
+0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document diff --git a/ALCF/data-lists/sunspot/arxiv.txt b/ALCF/data-lists/sunspot/arxiv.txt new file mode 100644 index 0000000000..c50df90503 --- /dev/null +++ b/ALCF/data-lists/sunspot/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document 
+0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document diff --git a/ALCF/data-lists/sunspot/books.txt b/ALCF/data-lists/sunspot/books.txt new file mode 100644 index 0000000000..7aa37a00d2 --- /dev/null +++ b/ALCF/data-lists/sunspot/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/c4.txt b/ALCF/data-lists/sunspot/c4.txt new file mode 100644 index 0000000000..9504bcbfe6 --- /dev/null +++ b/ALCF/data-lists/sunspot/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document diff --git a/ALCF/data-lists/sunspot/cc.txt b/ALCF/data-lists/sunspot/cc.txt new file mode 100644 index 0000000000..0a2a0ce35b --- /dev/null +++ b/ALCF/data-lists/sunspot/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document + diff --git a/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..5d142522a7 --- /dev/null +++ b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document 
+5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document 
+5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document 
+5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document 
+1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document 
+8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document 
+0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document 
+0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/sunspot/falcon.txt b/ALCF/data-lists/sunspot/falcon.txt new file mode 100644 index 0000000000..0b2fd6d43f --- /dev/null +++ b/ALCF/data-lists/sunspot/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document + diff --git a/ALCF/data-lists/sunspot/megawiki.txt b/ALCF/data-lists/sunspot/megawiki.txt new file mode 100644 index 0000000000..9fc9ca5dab --- /dev/null 
+++ b/ALCF/data-lists/sunspot/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document 
+5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document 
+5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document 
+8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document 
+3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document diff --git a/ALCF/data-lists/sunspot/open-web-math-train.txt b/ALCF/data-lists/sunspot/open-web-math-train.txt new file mode 100644 index 0000000000..b36e9977c0 --- /dev/null +++ b/ALCF/data-lists/sunspot/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document diff --git a/ALCF/data-lists/sunspot/pes2o.txt b/ALCF/data-lists/sunspot/pes2o.txt new file mode 100644 index 0000000000..63f805c06d --- /dev/null +++ b/ALCF/data-lists/sunspot/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document diff --git a/ALCF/data-lists/sunspot/reddit.txt b/ALCF/data-lists/sunspot/reddit.txt new file mode 100644 index 0000000000..59eafce1ee --- /dev/null +++ 
b/ALCF/data-lists/sunspot/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document diff --git a/ALCF/data-lists/sunspot/stack.txt b/ALCF/data-lists/sunspot/stack.txt new file mode 100644 index 0000000000..297783ac22 --- /dev/null +++ 
b/ALCF/data-lists/sunspot/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document diff --git a/ALCF/data-lists/sunspot/starcoder.txt b/ALCF/data-lists/sunspot/starcoder.txt new file mode 100644 index 0000000000..37e6333de5 --- /dev/null +++ b/ALCF/data-lists/sunspot/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document 
+0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document 
+0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document + diff --git a/ALCF/data-lists/sunspot/tulu.txt b/ALCF/data-lists/sunspot/tulu.txt new file mode 100644 index 0000000000..2b75802501 --- /dev/null +++ b/ALCF/data-lists/sunspot/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document 
+0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document diff --git a/ALCF/data-lists/sunspot/wiki.txt b/ALCF/data-lists/sunspot/wiki.txt new file mode 100644 index 0000000000..52af00d57b --- /dev/null +++ b/ALCF/data-lists/sunspot/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/fused_stackcode.py b/ALCF/fused_stackcode.py new file mode 100644 index 0000000000..28c1e5c694 --- /dev/null +++ b/ALCF/fused_stackcode.py @@ -0,0 +1,36 @@ +import os +from os import system +import glob +import json +import gzip +import pdb + +def list_json_gz_files(directory): + # Create the search pattern for JSON.gz files + search_pattern = os.path.join(directory, "**/*.json.gz") + + # Use glob to find all files matching the pattern + json_gz_files = glob.glob(search_pattern, recursive=True) + + return json_gz_files + +def 
"""Fuse many small ``*.json.gz`` shards into fewer, larger archives.

Two drivers share the helpers below:

* ``fuse_per_folder`` -- one ``fused.json.gz`` per sub-folder of the input
  directory (the ``fused_stackcode.py`` behaviour).
* ``fuse_by_size`` -- groups the already fused archives, smallest first,
  into chunks of roughly ``limit_mb`` megabytes (the
  ``fused_stackcode_bysize.py`` behaviour).

Nothing runs at import time; the work only starts when the file is executed
as a script.
"""
import glob
import os
import shutil
import sys

# Size threshold (in MB) at which the by-size driver closes a group.
SIZE_LIMIT_MB = 4608


def list_json_gz_files(directory):
    """Return every ``*.json.gz`` path found under ``directory``, recursively."""
    search_pattern = os.path.join(directory, "**/*.json.gz")
    return glob.glob(search_pattern, recursive=True)


def combine_json_gz_files(json_gz_files, output_file):
    """Concatenate ``json_gz_files`` byte-for-byte into ``output_file``.

    Concatenated gzip members form a valid gzip stream, so no decompression
    is needed.  The copy is done in-process instead of through a shell
    ``cat`` command line, which keeps paths containing spaces or shell
    metacharacters intact and cannot block on stdin when the list is empty
    (an empty list simply produces an empty output file).

    Args:
        json_gz_files: iterable of input archive paths, in output order.
        output_file: destination path; its parent directory is created
            when missing.
    """
    json_gz_files = list(json_gz_files)
    parent = os.path.dirname(output_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    target = os.path.abspath(output_file)
    print(f"fusing {len(json_gz_files)} file(s) -> {output_file}")
    with open(output_file, "wb") as fused:
        for path in json_gz_files:
            # Never read the archive that is currently being written.
            if os.path.abspath(path) == target:
                continue
            with open(path, "rb") as shard:
                shutil.copyfileobj(shard, fused)
    print("done")


def fuse_per_folder(directory_path="./data/stack-code/", out_root="./fused_stack"):
    """Write ``<out_root>/<folder>/fused.json.gz`` for each entry of ``directory_path``."""
    for folder in os.listdir(directory_path):
        source_dir = os.path.join(directory_path, folder)
        print(f"working for folder {folder} {source_dir}")
        out_path = os.path.join(out_root, folder)
        os.makedirs(out_path, exist_ok=True)
        combine_json_gz_files(
            list_json_gz_files(source_dir),
            os.path.join(out_path, "fused.json.gz"),
        )


def group_files_by_size(size_by_file, limit_mb=SIZE_LIMIT_MB):
    """Split files into groups, smallest first.

    A group is closed by the file that pushes its running total above
    ``limit_mb`` (that file is still part of the group).  Files left over
    after the last closed group form a final, smaller group instead of
    being dropped.

    Args:
        size_by_file: mapping ``path -> size in MB``.
        limit_mb: running-total threshold that closes a group.

    Returns:
        A list of lists of paths.
    """
    groups = []
    current = []
    volume = 0
    for path, size in sorted(size_by_file.items(), key=lambda item: item[1]):
        current.append(path)
        if volume + size > limit_mb:
            groups.append(current)
            current = []
            volume = 0
        else:
            volume += size
    if current:
        groups.append(current)
    return groups


def fuse_by_size(directory_path="./fused_stack/", out_path="./fused_by_size",
                 limit_mb=SIZE_LIMIT_MB):
    """Fuse the archives under ``directory_path`` into ``fused_stack_<i>.json.gz`` chunks.

    Returns:
        Mapping ``chunk index (1-based) -> list of input paths``.
    """
    size_by_file = {
        path: os.stat(path).st_size / (1024 * 1024)  # in MBs
        for path in list_json_gz_files(directory_path)
    }
    super_list = {}
    for index, sublist in enumerate(group_files_by_size(size_by_file, limit_mb), start=1):
        print("************")
        output_file = os.path.join(out_path, "fused_stack_" + str(index) + ".json.gz")
        print(output_file)
        combine_json_gz_files(sublist, output_file)
        super_list[index] = sublist
    return super_list


if __name__ == "__main__":
    # The same helpers back both scripts; choose the driver from the name
    # the script was started under.
    if "bysize" in os.path.basename(sys.argv[0]):
        fuse_by_size()
    else:
        fuse_per_folder()
#!/bin/bash --login
#
# set -euxo pipefail

# ---- Resolve the working directory -------------------------------------
# Prefer the submit directory reported by the scheduler (PBS, then SLURM)
# and fall back to the current directory otherwise.
if [[ -n "${PBS_O_WORKDIR:-}" ]]; then
    WORKING_DIR="${PBS_O_WORKDIR}"
elif [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
    WORKING_DIR="${SLURM_SUBMIT_DIR}"
else
    echo "Unable to detect PBS or SLURM working directory info..."
    # The shell already knows its own directory; no interpreter is needed.
    WORKING_DIR=$(pwd -P)
    echo "Using ${WORKING_DIR} as working directory..."
fi

export WORKING_DIR="${WORKING_DIR}"
# Quoted so that a path containing spaces is printed as a single value.
printf "Using WORKING_DIR: %s\n" "${WORKING_DIR}"


#######################################
# Dump the current environment to "<outdir>/.env".
# Globals:
#   DOTENV_FILE (written, exported) - path of the file that was written
# Arguments:
#   $1 - output directory (created when missing)
# Outputs:
#   Progress message on stdout; usage error on stderr.
# Returns:
#   0 on success, 1 when not called with exactly one argument.
#######################################
save_dotenv() {
    if [[ "$#" -ne 1 ]]; then
        local estr="[error]"
        printf "%s Expected one argument (outdir). Received: %s\n" \
            "$(printRed "${estr}")" "$#" >&2
        return 1
    fi
    local outdir="$1"
    local dotenv_file="${outdir}/.env"
    mkdir -p "${outdir}"
    # `module` only exists on systems with environment modules.
    if command -v module > /dev/null 2>&1; then
        module list
    fi
    echo "Saving environment to ${dotenv_file}"
    # LS_COLORS is long and carries no information worth keeping.
    printenv | grep -v "LS_COLORS" > "${dotenv_file}"
    export DOTENV_FILE="${dotenv_file}"
}
#######################################
# Print the name of the machine we are running on, derived from the
# host name prefix (x4* aurora, x1* sunspot, x3* polaris -- or sirius when
# PBS_O_HOST starts with "sirius" --, nid* perlmutter).  Any other host
# name is printed unchanged.
# Globals:
#   PBS_O_HOST (read), machine (written)
# Outputs:
#   The machine name on stdout.
#######################################
where_am_i() {
    local host
    host=$(hostname)
    case "${host}" in
        x4*) machine="aurora" ;;
        x1*) machine="sunspot" ;;
        x3*)
            if [[ "${PBS_O_HOST:-}" == sirius* ]]; then
                machine="sirius"
            else
                machine="polaris"
            fi
            ;;
        nid*) machine="perlmutter" ;;
        *) machine="${host}" ;;
    esac
    echo "${machine}"
}

#######################################
# Export MACHINE using the same detection rules as where_am_i.
# On an unrecognised host MACHINE really is set to the host name, as the
# warning message announces (it used to be left empty or stale).
# Globals:
#   MACHINE (written, exported), machine (written)
# Outputs:
#   "Running on: <machine>" on stdout, preceded by a warning on unknown hosts.
#######################################
get_machine() {
    local host
    host=$(hostname)
    case "${host}" in
        x4* | x1* | x3* | nid*) ;;
        *) echo "Unknown MACHINE. Setting MACHINE to ${host} and continuing..." ;;
    esac
    machine=$(where_am_i)
    export MACHINE="${machine}"
    printf "Running on: %s\n" "$(printBlue "${MACHINE}")"
}


#######################################
# Kill whatever still holds the rendezvous port left over from a
# previous launch.
# Globals:
#   RUNNING_PIDS (written) - space separated PIDs found on the port
# Arguments:
#   $1 - port to inspect (optional, default 29500)
# Outputs:
#   A status line on stdout.
#######################################
check_and_kill_if_running() {
    local port="${1:-29500}"
    local -a pids=()
    # `lsof -t` prints bare PIDs, one per line; every owner of the port is
    # collected rather than only the first one.
    RUNNING_PIDS=$(lsof -t -i:"${port}" | sort -u | tr '\n' ' ')
    RUNNING_PIDS="${RUNNING_PIDS% }"
    if [[ -n "${RUNNING_PIDS}" ]]; then
        echo "Caught ${RUNNING_PIDS}"
        read -r -a pids <<< "${RUNNING_PIDS}"
        kill "${pids[@]}"
    else
        echo "Not currently running. Continuing!"
    fi
}
#######################################
# Build the `srun` command line on hosts named login* or nid*; on any
# other host only a "Skipping" notice is printed.
# Globals:
#   SLURM_NNODES, SLURM_GPUS_ON_NODE (read)
#   NHOSTS, NGPU_PER_HOST, NGPUS, SRUN_EXEC (written, exported)
#######################################
setupSrun() {
    local host
    host=$(hostname)
    if [[ "${host}" == login* || "${host}" == nid* ]]; then
        export NHOSTS="${SLURM_NNODES:-1}"
        if [[ -n "${SLURM_GPUS_ON_NODE:-}" ]]; then
            export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE}"
        else
            # Arithmetic expansion strips the padding some `wc`
            # implementations add, so it cannot leak into SRUN_EXEC.
            export NGPU_PER_HOST="$(( $(nvidia-smi -L | wc -l) ))"
        fi
        export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
        export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose"
    else
        echo "Skipping setupSrun() on ${host}"
    fi
}


#######################################
# Print a short summary of the job configuration.
# Globals:
#   MPICH_DIR, MPI_ROOT, WORLD_SIZE, NCCL, MODEL_TYPE, DATA_FILE_LIST (read)
#######################################
printJobInfo() {
    echo "++++++++++++++++++++++++++++++++++++++++++++++++++"
    echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}"
    echo "- Using $(command -v python3)"
    echo "- WORLD_SIZE:${WORLD_SIZE}"
    echo "- NCCL: ${NCCL:-nccl}"
    echo "- MODEL_TYPE: ${MODEL_TYPE}"
    echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}"
    echo "++++++++++++++++++++++++++++++++++++++++++++++++++"
}

#######################################
# Activate the virtual environment found in the given directory.
# Globals:
#   VENV_DIR (written)
# Arguments:
#   $1 - virtual environment directory
# Outputs:
#   A "Skipping" notice when the directory does not exist.
#######################################
setupVenv() {
    VENV_DIR="${1:-}"
    if [[ -d "${VENV_DIR}" ]]; then
        echo "Found venv at: ${VENV_DIR}"
        source "${VENV_DIR}/bin/activate"
    else
        echo "Skipping setupVenv() on $(hostname)"
    fi
}

#######################################
# Load a conda module and activate an environment, unless CONDA_EXE is
# already set.
# Globals:
#   CONDA_EXE (read), MODULE_STR (written)
# Arguments:
#   $1 - conda module version (optional; without it plain `conda` is
#        loaded instead of the invalid module name `conda/`)
#   $2 - environment to activate (optional, default: base)
#######################################
loadCondaEnv() {
    if [[ -n "${CONDA_EXE:-}" ]]; then
        echo "Already inside ${CONDA_EXE}, exiting!"
        return 0
    fi
    MODULE_STR="${1:-}"
    module load "conda${MODULE_STR:+/${MODULE_STR}}"
    if (( $# >= 2 )); then
        conda activate "$2"
    else
        conda activate base
    fi
}
#######################################
# Assemble the command used to launch ${EXEC}.
# With LAUNCH_CMD=deepspeed the DeepSpeed launcher is used and
# "${WORKING_DIR}/hostfile_deepspeed" must exist; otherwise ${DIST_LAUNCH}
# is used (LAUNCH_CMD defaults to MPICH).
# Globals:
#   LAUNCH_CMD, WORKING_DIR, DIST_LAUNCH, EXEC (read)
#   hfds, LAUNCHER (written, exported)
# Outputs:
#   The chosen launcher on stdout; an error on stderr when the hostfile is
#   missing (the shell then exits with status 1).
#######################################
setupLauncher() {
    local launch_cmd="${LAUNCH_CMD:-MPICH}"
    if [[ "${launch_cmd}" == "deepspeed" ]]; then
        export hfds="${WORKING_DIR}/hostfile_deepspeed"
        if [[ ! -f "${hfds}" ]]; then
            printf 'setupLauncher: missing DeepSpeed hostfile: %s\n' "${hfds}" >&2
            exit 1
        fi
        export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}"
    else
        export LAUNCHER="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(command -v python3) -Wignore ${EXEC}"
    fi
    # Show the effective launcher name, including the MPICH default.
    printf "Launching with: %s\n" "$(printRed "${launch_cmd}")"
    printf " %s\n" "$(printMagenta "${LAUNCHER}")"
}

#######################################
# Prepare the DeepSpeed launcher arguments from the hostfiles in a directory.
# NOTE(review): this function treats LAUNCHER as a launcher *name*
# (default MPICH) while setupLauncher stores a full command line in the
# same variable -- confirm the two are never combined in one run.
# Globals:
#   LAUNCHER (read, defaulted, exported); hfds, hfmpi, launcher (written, exported)
# Arguments:
#   $1 - directory holding hostfile_deepspeed and hostfile_mpich
# Outputs:
#   An error on stderr when a hostfile is missing (the shell then exits 1).
#######################################
setDSlauncher() {
    local outdir=$1
    local required
    export hfds="$outdir/hostfile_deepspeed"
    export hfmpi="$outdir/hostfile_mpich"
    for required in "${hfds}" "${hfmpi}"; do
        if [[ ! -f "${required}" ]]; then
            printf 'setDSlauncher: missing hostfile: %s\n' "${required}" >&2
            exit 1
        fi
    done
    export LAUNCHER=${LAUNCHER:-MPICH}
    if [[ $LAUNCHER == "deepspeed" ]]; then
        export launcher=""
    else
        export launcher="--force_multi --hostfile $hfds --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'"
    fi
}

#######################################
# Build the learning-rate flags: cosine decay at ${LR}, plus
# --lr-decay-iters / --lr-warmup-fraction when the matching variables are set.
# Globals:
#   LR, LR_DECAY_ITERS, LR_WARMUP_FRAC (read); LR_ARGS (written, exported)
# Outputs:
#   "LR_ARGS: ..." on stdout.
#######################################
set_lr_args() {
    LR_ARGS="--lr ${LR} --lr-decay-style cosine"
    if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
        LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}"
    fi
    if [[ -n "${LR_WARMUP_FRAC:-}" ]]; then
        LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}"
    fi
    echo "LR_ARGS: ${LR_ARGS}"
    export LR_ARGS="${LR_ARGS}"
}


#######################################
# Pick a micro batch size from the number of nodes in ${PBS_NODEFILE}
# when the host name starts with x3:
#   1-2 nodes -> 1,  3-7 nodes -> 2,  8 or more nodes -> 4
# The thresholds are tested from the largest down so the 8+ case is
# reachable.  On any other host the current value of `mbs` is printed
# unchanged.
# Globals:
#   PBS_NODEFILE (read), mbs (written)
# Outputs:
#   The micro batch size on stdout.
#######################################
get_batch_size_on_polaris() {
    if [[ $(hostname) == x3* ]]; then
        local nhosts
        nhosts=$(wc -l < "${PBS_NODEFILE}")
        # Normalise to a plain integer (empty or padded output becomes a number).
        nhosts=$(( nhosts ))
        if (( nhosts >= 8 )); then
            mbs=4
        elif (( nhosts >= 3 )); then
            mbs=2
        elif (( nhosts >= 1 )); then
            mbs=1
        fi
    fi
    echo "${mbs}"
}
# Populate the parallelism, model and run-setting variables for a Llama
# training run.
#
# Every value is taken from the environment when it is already set;
# otherwise a default is chosen.  Some defaults depend on the host name
# prefix (x4*/x1*, x3*, login*/nid*), so the order of the assignments
# below matters: the per-machine branch runs first and the generic
# defaults further down only apply to values that are still unset.
#
# Reads:   HOSTFILE or PBS_NODEFILE (node count), NGPU_PER_HOST,
#          NO_FLASH_ATTN, WORKING_DIR and any of the variables it defaults.
# Writes:  the variables exported below, plus the unexported NHOSTS,
#          NGPU_PER_HOST (when it was empty) and TIMING_STR.
# Calls:   get_batch_size_on_polaris (x3* hosts only) and set_lr_args.
# Exits the shell if the AWS OFI NCCL plugin script cannot be sourced on
# x3* hosts.
setParams() {
    LLAMA_ARGS=""
    # +----[Parallelism Settings] -------------------------------------------+
    # +------[Aurora]--------||-------[SunSpot]-------------+
    if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then
        TP=${TP:-1}                                     # tensor parallel size, default 1
        export CCL=${CCL:-ccl}                          # CCL
        export BE="${CCL}"                              # communication backend = CCL
        export DTYPE=${DTYPE:-bf16}                     # default dtype: bf16
        export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8}      # gradient accumulation steps, default 8
        MICRO_BATCH=${MICRO_BATCH:-4}                   # micro batch size, default 4
        ##############################################################
        # NOTE: a NON-empty NO_FLASH_ATTN disables flash attention.
        if [[ -n "${NO_FLASH_ATTN-}" ]]; then
            echo "Not using flash-attn!!"
        else
            LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-builder"
        fi
        ##############################################################
    # +--------[Polaris]-----------------------------------+
    elif [[ $(hostname) == x3* ]]; then
        # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}"
        TP=${TP:-1}                                     # tensor parallel size, default 1
        export NCCL=${NCCL:-nccl}                       # NCCL
        export BE="${NCCL}"                             # communication backend = NCCL
        # export DTYPE=${DTYPE:-bf16}                   # DTYPE: BF16 ??
        export DTYPE=${DTYPE:-fp16}                     # default dtype: fp16
        export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8}      # gradient accumulation steps, default 8
        # NOTE: MICRO_BATCH defaults to the value printed by
        # get_batch_size_on_polaris when it is not set in the environment.
        # MICRO_BATCH=${MICRO_BATCH:-2}
        export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}"
        if [[ -n "${NO_FLASH_ATTN-}" ]]; then
            echo "Not using flash-attn!!"
        else
            LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2"
        fi
        echo "Setting up AWS NCCL OFI Plugin on Polaris..."
        source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit
    # +--------[Perlmutter]---------------------------------+
    elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then
        TP="${TP:-2}"                                   # tensor parallel size, default 2
        export NCCL="${NCCL:-nccl}"
        export BE="${NCCL}"
        export DTYPE="${DTYPE:-bf16}"
        MICRO_BATCH="${MICRO_BATCH:-8}"
        if [[ -n "${NO_FLASH_ATTN-}" ]]; then
            echo "Not using flash-attn!!"
        else
            LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2"
        fi
    fi
    # +----------------------------------------------------------------------+
    # NOTE: on a host matching none of the prefixes above TP is still
    # whatever the environment provided (possibly empty) at this point.
    export TP="${TP}"
    export PP="${PP:-1}"
    export DTYPE="${DTYPE:-bf16}"                       # only applies when no branch above set DTYPE
    export OPT="${OPT:-adamw}"
    export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"
    NHOSTS=$(wc -l < "${HOSTFILE}")                     # one line per node
    if [[ -z "${NGPU_PER_HOST-}" ]]; then
        NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())')
    fi
    export WORLD_SIZE="${WORLD_SIZE:-$(( NHOSTS * NGPU_PER_HOST ))}"
    # +---[Llama2 7B Config]--------------------------------------------------+
    export MODEL_KEY="Llama-7B"
    export HEADS=${HEADS:-${NHEADS:-32}}                # NUMBER OF ATTENTION HEADS
    export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}}        # NUMBER OF LAYERS
    export HIDDEN=${HIDDEN:-4096}                       # HIDDEN SIZE
    export NUM_KV_HEAD=${NUM_KV_HEAD:-8}                # GROUP ATTENTION
    export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008}    # FFN HIDDEN SIZE
    # +---[Run Settings]------------------------------------------------------+
    export SEQ=${SEQ:-4096}                             # SEQ_LEN: 4096
    export ZERO_STAGE=${ZERO_STAGE:-1}                  # ZERO OFFLOADING STAGE
    export MICRO_BATCH=${MICRO_BATCH:-8}                # MICRO BATCH SIZE (8 only if still unset here)
    export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1}          # GRADIENT ACCUMULATION STEPS (1 only if still unset here)
    export EVAL_ITERS="${EVAL_ITERS:-10}"               # NUMBER OF EVAL ITERS TO RUN
    export TRAIN_ITER=${TRAIN_ITER:-317892}             # NUMBER OF TRAIN ITERS
    export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}"      # HOW FREQUENTLY TO RUN EVAL
    export SAVE_INTERVAL=${SAVE_INTERVAL:-200}          # HOW FREQUENTLY TO SAVE CKPTS
    export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}"    # TIMING VERBOSITY IN LOGS
    export ACT_CKPT_NUM_LAYERS="${ACT_CKPT_NUM_LAYERS:-1}"                  # NUM LAYERS TO CHECKPOINT ACTIVATIONS
    export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1}  # USE ACTIVATION CHECKPOINTING ?
    # Integer arithmetic: the divisions by TP and PP truncate.
    export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP ))  # MAX GLOBAL BATCH SIZE
    export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}"  # WILL USE MAX IF NOT SET IN ENVIRONMENT
    # tm="${WORKING_DIR}/ALCF/tokenizer.model"            # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model
    # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}"  # USE TOKENIZER_MODEL from env, else fallback from ^
    export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden"  # STRING FOR IDENTIFYING MODEL
    # +----[ADDITIONAL LLAMA SPECIFIC ARGUMENTS]------------------------------
    export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear"
    export LR=${LR:-0.0003}                             # LEARNING_RATE
    export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05}       # LEARNING RATE WARMUP FRACTION
    # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-320000}   # LR DECAY ITERS
    export LR_DECAY_ITERS=${LR_DECAY_ITERS:-}           # LR DECAY ITERS (stays empty unless set in the environment)
    set_lr_args
    # Timing flags are collected in TIMING_STR (not exported here).
    if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then
        TIMING_STR="\
            --timing-log-level ${TIMING_LOG_LEVEL} \
            --log-timers-to-tensorboard \
            --log-optimizer-states-to-tensorboard \
            "
    else
        TIMING_STR=""
    fi
}
#######################################
# Build the DeepSpeed and activation-checkpointing arguments.
# Globals:
#   PP, DS_CONFIG, ZERO_STAGE, USE_ACTIVATION_CHECKPOINTING,
#   ACT_CKPT_NUM_LAYERS (read)
#   ds_args  (written, exported) - string of DeepSpeed flags
#   gpt_args (written)           - array of activation-checkpointing flags
# Outputs:
#   A notice on stdout when activation checkpointing is enabled.
#######################################
setArgs() {
    # ---- Set DeepSpeed arguments --------------------------------
    # Each flag is prepended, so the last one added comes first.
    ds_args=" "
    ds_args=" --deepspeed ${ds_args}"
    if [[ "${PP}" == 1 ]]; then
        ds_args=" --no-pipeline-parallel ${ds_args}"
    fi
    ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
    ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
    # ---------------------------------------------------------------
    gpt_args=()
    if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
        echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!"
        ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
        # Flag and value are separate elements, so both "${gpt_args[@]}"
        # and an unquoted expansion hand the program two words.
        gpt_args+=(
            "--checkpoint-activations"
            "--checkpoint-num-layers" "${ACT_CKPT_NUM_LAYERS}"
        )
    fi
    export ds_args
    # NOTE: bash cannot pass arrays to child processes through the
    # environment; gpt_args is only visible to the shell that sourced this.
    export gpt_args
}


#######################################
# Write `hostfile_mpich` (a copy of the node list) and
# `hostfile_deepspeed` (each line suffixed with " slots=<gpus per node>")
# into the current directory.
# NOTE(review): setupLauncher reads ${WORKING_DIR}/hostfile_deepspeed, so
# this presumably has to run from WORKING_DIR -- confirm with callers.
# Globals:
#   HOSTFILE, PBS_NODEFILE, NGPU_PER_HOST, SLURM_GPUS_ON_NODE (read)
#   GPUS_PER_NODE, hostfile_mpich, hostfile_deepspeed (written, exported)
# Returns:
#   0 on success, 1 when the source hostfile is missing or a write fails.
#######################################
make_ds_hostfile() {
    local hf
    if [[ -z "${GPUS_PER_NODE:-}" ]]; then
        GPUS_PER_NODE="${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(( $(nvidia-smi -L | wc -l) ))}}"
    fi
    export GPUS_PER_NODE
    hf="${HOSTFILE:-${PBS_NODEFILE:-}}"
    if [[ -z "${hf}" || ! -r "${hf}" ]]; then
        printf 'make_ds_hostfile: cannot read hostfile: %s\n' "${hf:-<unset>}" >&2
        return 1
    fi
    # ---- Make MPICH hostfile ----------------
    export hostfile_mpich=hostfile_mpich
    cat -- "${hf}" > "${hostfile_mpich}" || return 1
    # ---- Make DeepSpeed hostfile -------------------
    export hostfile_deepspeed=hostfile_deepspeed
    # Written in one pass; avoids `sed -i`, whose syntax differs between
    # GNU and BSD sed.
    sed -e "s/$/ slots=${GPUS_PER_NODE}/" -- "${hf}" > "${hostfile_deepspeed}" || return 1
}

# +---------------------------------------+
# | 1. Git clone ezpz (if not found)      |
# | 2. Install ezpz (if not installed)    |
# | 3. Source its job-environment helpers |
# +---------------------------------------+
#######################################
# Make sure ezpz is cloned and installed, then source its `savejobenv` /
# `getjobenv` scripts and build the hostfiles.  The job environment is set
# up on the very first run as well (previously a fresh install returned
# without sourcing anything).
# Globals:
#   WORKING_DIR (read)
# Returns:
#   1 when the install fails; exits the shell when `getjobenv` or
#   make_ds_hostfile fail.
#######################################
ezpz() {
    local ezdir="${WORKING_DIR}/deps/ezpz"
    local ezloc
    if [[ ! -d "${ezdir}" ]]; then
        mkdir -p "${WORKING_DIR}/deps"
        git clone https://github.com/saforem2/ezpz "${ezdir}"
    else
        echo "Found ezpz!"
    fi
    # For an editable install the last column of `pip list` is the project
    # location; match the package name exactly.
    ezloc=$(python3 -m pip list | awk '$1 == "ezpz" { print $NF; exit }')
    if [[ -z "${ezloc}" ]]; then
        echo "No ezpz detected. Attempting to install with $(command -v python3)"
        python3 -m pip install -e "${ezdir}" --require-virtualenv || return 1
        ezloc="${ezdir}"
    elif [[ ! -f "${ezloc}/src/ezpz/bin/getjobenv" ]]; then
        # Regular (non-editable) install: the last column is the version,
        # not a path, so fall back to the local clone.
        ezloc="${ezdir}"
    fi
    echo "ezpz detected. Sourcing ${ezloc}/src/ezpz/bin/savejobenv"
    source "${ezloc}/src/ezpz/bin/savejobenv" > /dev/null 2>&1
    source "${ezloc}/src/ezpz/bin/getjobenv" || exit
    make_ds_hostfile || exit
    echo "Done with ezpz."
}
# +------------------------------------------------------------------------+
# | Save important environment variables to .deepspeed_env, which will be  |
# | forwarded to ALL ranks with DeepSpeed                                  |
# +------------------------------------------------------------------------+
#######################################
# Write NAME=value lines for PATH, LD_LIBRARY_PATH, http_proxy,
# https_proxy, CFLAGS and PYTHONUSERBASE to ./.deepspeed_env.
# Variables that are unset or empty are left out, so they are not
# forwarded as empty assignments.
# Outputs:
#   A one-line notice on stdout; the file .deepspeed_env in the current
#   directory (overwritten).
#######################################
saveDSenv() {
    echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env"
    local name
    for name in PATH LD_LIBRARY_PATH http_proxy https_proxy CFLAGS PYTHONUSERBASE; do
        # ${!name} reads the variable whose name is stored in `name`.
        if [[ -n "${!name:-}" ]]; then
            printf '%s=%s\n' "${name}" "${!name}"
        fi
    done > .deepspeed_env
}


#######################################
# Compose the run identifier used for log and checkpoint directories
# from the current training configuration.
# Globals:
#   WORLD_SIZE, ZERO_STAGE, NLAYERS, HIDDEN, MICRO_BATCH, SEQ,
#   GLOBAL_BATCH, PP, TP, DTYPE, OPT, LR, LR_WARMUP_FRAC (read)
#   LR_DECAY_ITERS (read; adds "_ldi<N>" when non-empty)
#   NO_FLASH_ATTN  (read; "_flash" is added when it is empty or unset)
#   OUTPUT_PREFIX  (written, exported)
# Outputs:
#   The prefix on stdout.
#######################################
get_output_prefix() {
    # ---- Specify output location --------------------------------
    local pre
    pre="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}"
    pre+="_hs${HIDDEN}_mb${MICRO_BATCH}"
    pre+="_seq${SEQ}_gb${GLOBAL_BATCH}"
    pre+="_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}"
    pre+="_lr${LR}_lwf${LR_WARMUP_FRAC}"
    if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
        pre+="_ldi${LR_DECAY_ITERS}"
    fi
    if [[ -z "${NO_FLASH_ATTN:-}" ]]; then
        pre+="_flash"
    fi
    export OUTPUT_PREFIX="${pre}"
    echo "${pre}"
}
#######################################
# Choose and create the output directory for this run and register its
# log file in logs/latest.
# Globals:
#   WORLD_SIZE, HOSTNAME (read)
#   OUTPUT_PREFIX, OUTPUT_DIR, OUTPUT_LOG, CKPT_DIR (written, exported)
# Outputs:
#   The log and checkpoint locations on stdout.
#######################################
setOutput() {
    OUTPUT_PREFIX=$(get_output_prefix)
    export OUTPUT_PREFIX
    export OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}"
    export OUTPUT_LOG="${OUTPUT_DIR}/output.log"
    export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}"
    # Create the directory first: it also creates logs/, which must exist
    # before logs/latest can be appended to on a first run.
    mkdir -p "${OUTPUT_DIR}"
    echo "${OUTPUT_LOG}" >> "logs/latest"
    printf "\n Please see logs at: %s\n" "$(printGreen "${OUTPUT_DIR}")"
    printf "Checkpoints will be saved to: %s\n" "$(printYellow "${CKPT_DIR}")"
}

#######################################
# Derive the DeepSpeed config path from the run settings and generate
# the file with generateDSconfig.
# Globals:
#   WORKING_DIR, ZERO_STAGE, MICRO_BATCH, GLOBAL_BATCH, PP, DTYPE (read)
#   CPU_OPTIMIZER (defaulted to 0, exported); DS_CONFIG (written, exported)
# Outputs:
#   The config path and a one-line summary of the settings on stdout.
#######################################
buildDSconfig() {
    # ---- Build DeepSpeed Config ---------------------------------
    export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}"
    export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json"
    mkdir -p "$(dirname "${DS_CONFIG}")"
    echo "DS_CONFIG: ${DS_CONFIG}"
    # One placeholder per value, so every label matches the value after it.
    printf "ZS: %s, CPU_OPTIMIZER: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s\n" \
        "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}"
    generateDSconfig "${DS_CONFIG}"
    # -------------------------------------------------------------
}


#######################################
# Print the sum of the first column (the weights) of a data file list.
# Arguments:
#   $1 - file with lines of the form "<weight> <path>"
# Outputs:
#   The sum on stdout (0 for an empty file).
#######################################
sumWeights() {
    local file_list=$1
    # %.17g keeps full double precision; blank lines are skipped.
    awk 'NF { total += $1 } END { printf "%.17g\n", total }' "${file_list}"
}

#######################################
# Print the weight sum of every *.txt list in a directory.
# Arguments:
#   $1 - directory containing the *.txt data file lists
# Outputs:
#   One "sum(<file>.weights)=<sum>" line per list on stdout.
#######################################
sumFiles() {
    local rd=$1
    local f ws
    # Iterate over the glob itself; putting it inside $( ) would try to
    # run the pattern as a command.
    for f in "${rd}"/*.txt; do
        [[ -e "${f}" ]] || continue # no matching file: pattern stays literal
        ws=$(sumWeights "${f}")
        echo "sum(${f##*/}.weights)=${ws}"
    done
}
# - consequently, we leave the setup below commented out (for the time
#   being):
# if [[ -z "${CONDA_PREFIX-}" ]]; then
#   module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2
# else
#   echo "Caught CONDA_PREFIX=${CONDA_PREFIX}"
# fi
########################################################
# Load the frameworks module on Sunspot unless a conda env is already active.
setup_conda_sunspot() {
  ###### check if CONDA_PREFIX non-empty ################
  if [[ -z "${CONDA_PREFIX:-}" ]]; then
    module use /soft/preview-modulefiles/24.086.0
    module load frameworks/2024.04.15.002.lua
  fi
}

########################
# Setup conda on Sirius
########################
# Activates the `2024-04-23` micromamba env when neither CONDA_PREFIX nor
# VIRTUAL_ENV is set; otherwise only reports the python3 already on PATH.
setup_conda_sirius() {
  local shell_name
  if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then
    export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba
    shell_name="${SHELL##*/}" # last path component, e.g. /bin/zsh -> zsh
    eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell "${shell_name}")"
    micromamba activate 2024-04-23
  else
    echo "Found existing python at: $(command -v python3)"
  fi
}

########################
# Setup conda on Polaris
########################
setup_conda_polaris() {
  # unset MPICH_GPU_SUPPORT_ENABLED
  ###### check if CONDA_PREFIX non-empty ################
  if [[ -z "${CONDA_PREFIX-}" ]]; then
    # if so, load the default conda/2024-04-29
    # module and activate base environment
    module use /soft/modulefiles
    module load conda
    conda activate base
  else
    echo "Caught CONDA_PREFIX=${CONDA_PREFIX}"
  fi
}

#######################################
# Create (if needed) and activate a venv layered on the active conda env.
# Globals (read):  CONDA_PREFIX, VIRTUAL_ENV, WORKING_DIR
# Globals (write): CONDA_NAME, VENV_DIR
# Notes: does nothing when CONDA_PREFIX is empty or a venv is already active.
#        Still calls `exit` when the new venv cannot be activated, as before.
#######################################
setup_venv_from_conda() {
  if [[ -z "${CONDA_PREFIX:-}" ]]; then
    # The old message interpolated the (empty) variable: "No  found."
    echo "No CONDA_PREFIX found." # Exiting."
    # exit 1
    return 0
  fi
  if [[ -n "${VIRTUAL_ENV:-}" ]]; then
    # Report the venv that is actually active (VENV_DIR may be unset here).
    echo "Already inside virtual env at ${VIRTUAL_ENV}!"
    return 0
  fi
  echo "No VIRTUAL_ENV found in environment!"
  echo " - Trying to setup from ${CONDA_PREFIX}"
  # Last path component of CONDA_PREFIX once any "mconda3" part is removed,
  # e.g. /soft/.../conda/2024-04-29/mconda3 -> 2024-04-29
  CONDA_NAME=$(echo "${CONDA_PREFIX}" | tr '\/' '\t' | sed -E 's/mconda3|\/base//g' | awk '{print $NF}')
  VENV_DIR="${WORKING_DIR}/venvs/${CONDA_NAME}"
  echo " - Using VENV_DIR=${VENV_DIR}"
  # VENV_DIR="venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | sed -E 's/mconda3|\/base//g' | awk '{print $NF}')"
  # VENV_DIR="${WORKING_DIR}/venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')"
  # VENV_DIR="${WORKING_DIR}/venvs/anl_24_q2_release"
  if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then
    printf "\n - Creating a new virtual env on top of %s in %s" "$(printBlue "${CONDA_NAME}")" "$(printGreen "${VENV_DIR}")"
    mkdir -p "${VENV_DIR}"
    python3 -m venv "${VENV_DIR}" --system-site-packages
    if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then
      # This diagnostic used to sit in an unreachable `else` branch.
      printf "\n [!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")"
      exit 1
    fi
    source "${VENV_DIR}/bin/activate" || exit
  else
    echo " - Found existing venv, activating from $(printBlue "${VENV_DIR}")"
    source "${VENV_DIR}/bin/activate"
  fi
}

##########################################################
# Check that we can find the `.py` file we wish to launch
##########################################################
# Arguments: $1 - path to the python file to launch
# Globals (write): EXEC (the path), EXEC_STEM (file name without ".py")
# Returns:   0 if the file exists, 1 otherwise
check_executable() {
  local fp=$1
  local exec_stem estr
  if [[ -f "${fp}" ]]; then
    # Export the path that was actually checked; the previous code re-exported
    # whatever EXEC already held and derived the stem from that.
    export EXEC="${fp}"
    # ----[1.5 Keep track of stem from file path]-------------------------
    exec_stem="${fp##*/}"
    export EXEC_STEM="${exec_stem//.py/}"
  else
    estr="Unable to locate executable ${fp}"
    printf "[ALCF.helpers:check_executable] %s\n" "$(printRed "${estr}")"
    return 1
  fi
}



#######################################
# Make sure a python environment is active: use the one already present or
# load the machine-specific conda setup, then layer a venv on top of it.
# Globals (read):  VIRTUAL_ENV, CONDA_PREFIX, PBS_O_HOST, SLURM_SUBMIT_DIR
# Globals (write): PYTHON_EXEC (exported)
# Notes: calls `exit 1` on an unrecognised hostname.
#######################################
setEnv() {
  local virtual_env="${VIRTUAL_ENV:-}"
  local conda_prefix="${CONDA_PREFIX:-}"
  local host pystr
  if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then
    echo "No virtual environment found."
    echo "Using conda from: ${conda_prefix}"
    echo "Setting up venv from ${CONDA_PROMPT_MODIFIER:-}"
    # the venv itself is created once, after this if/elif chain
  elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then
    echo "No conda found."
    echo "Using virtual_env from: ${virtual_env}"
  elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then
    echo "Using virtual_env: ${virtual_env} on top of conda from: ${conda_prefix}"
  else
    echo "No conda_prefix or virtual_env found in environment..."
    echo "Setting up conda..."
    ######################## setup_conda ############################
    host=$(hostname)
    case "${host}" in
      x4*) # ----- [Aurora @ ALCF] ------------------------------------
        # TODO: Update once Aurora back online
        eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2
        ;;
      x1*) # ----- [SunSpot @ ALCF] -----------------------------------
        echo "Running on SunSpot !!"
        setup_conda_sunspot
        # MPICH_MODULES=$(echo $LOADEDMODULES | tr ':' '\n' | grep mpich)
        # if [[ -z "${MPICH_MODULES" ]]; then
        #   source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit
        # else
        #   echo "Caught MPICH_MODULES: ${MPICH_MODULES}"
        # fi
        ;;
      x3*) # ----- [Polaris / Sirius @ ALCF] --------------------------
        if [[ "${PBS_O_HOST:-}" == sirius* ]]; then
          echo "Running on Sirius !!"
          setup_conda_sirius
        else
          echo "Running on Polaris !!"
          # ---- [load conda] -------------------------------------
          setup_conda_polaris
        fi
        ;;
      login* | nid*) # ----- [Perlmutter @ NERSC] ---------------------
        echo "Running on Perlmutter !!"
        module load pytorch
        source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate"
        ;;
      *) # ------------------------------------- [Unknown] ------------
        echo "Unknown hostname ${host}"
        exit 1
        ;;
    esac
  fi
  # Single call: the conda-without-venv branch used to call this and then fall
  # into the same call again here.
  if [[ -z "${virtual_env}" ]]; then
    setup_venv_from_conda
  fi
  #####################################################################
  pystr="Using: $(command -v python3)"
  printf "[python] %s" "$(printMagenta "${pystr}")"
  printf "\n"
  export "PYTHON_EXEC=$(command -v python3)"
}


######################################################################
# `makeHostiles`:
#     Detect if `HOSTFILE` set in active environment.
#         - If so, use this.
#         - Otherwise, make default HOSTFILEs from "${PBS_NODEFILE}"
######################################################################
makeHostfiles() {
  if [[ -n "${HOSTFILE:-}" ]]; then
    printf "!! USING CUSTOM HOSTFILE FROM: %s\n" "${HOSTFILE}"
  else
    make_ds_hostfile
  fi
}

##################################################
# Setup tokenizer as either Llama2 or GPT2 style
##################################################
# Arguments: $1 - tokenizer name; anything starting with gpt/GPT selects GPT2,
#                 everything else selects Llama2
#            $2 - (optional) data file list; defaults to ${DATA_FILE_LIST}
# Globals (write): tok, dfl (left global, as before), TOKENIZER_TYPE,
#                  TOKENIZER_FLAGS, DATA_FLAGS, plus DATA_PARENT/VOCAB_FILE/
#                  MERGE_FILE/DATA_PATH (GPT2) or TOKENIZER_MODEL and whatever
#                  setData exports (Llama2)
# Returns:   1 on a wrong argument count
setup_tokenizer_and_data() {
  local machine
  if [[ "$#" == 1 ]]; then
    tok="$1"
    dfl="${DATA_FILE_LIST:-}"
  elif [[ "$#" == 2 ]]; then
    tok="$1"
    dfl="$2"
  else
    # Stop here instead of continuing with an unset or stale ${tok}.
    echo "Incorrect number of arguments passed. Received: $#, expected 1 or 2"
    return 1
  fi
  echo "Setting up tokenizer with ${tok}"
  echo "Using data_file_list: ${dfl}"
  if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then
    export TOKENIZER_TYPE="GPT2"
    export TOKENIZER_FLAGS="--tokenizer-type GPT2BPETokenizer"
    machine=$(where_am_i)
    if [[ ${machine} == "polaris" ]]; then
      export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}"
    elif [[ ${machine} == "sunspot" ]]; then
      export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}"
    else
      export DATA_PARENT="${DATA_PARENT:-${WORKING_DIR}/dataset}"
    fi
    export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json"
    export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt"
    export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document"
    export DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}"
  else
    export DATA_FLAGS=""
    export TOKENIZER_TYPE="Llama2"
    tm="${WORKING_DIR}/ALCF/tokenizer.model"           # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model
    export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^
    export TOKENIZER_FLAGS="--tokenizer-type Llama2Tokenizer --tokenizer-model ${TOKENIZER_MODEL}"
    echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}"
    setData "${dfl}" || exit
  fi
  # Quoted so the flags are printed on one line instead of one word per line.
  printf "[setData] DATA_FLAGS: %s\n" "$(printGreen "${DATA_FLAGS}")"
  printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta "${TOKENIZER_FLAGS}")"
}


###############################################
# `setData`:
#     Ensure `DATA_FILE_LIST` is set,
#     fallback to default values if necessary.
###############################################
# Arguments: $1 - (optional) data file list; when empty a machine-specific
#                 fallback is chosen from the hostname
# Globals (write): dfl, DATA_FILE_LIST, NUM_DOCS, WEIGHT_SUM, DFL_STEM,
#                  DATA_CACHE_PATH, DATA_FLAGS (all but dfl exported)
# Returns:   1 when no readable data file list can be determined
setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST]
  # if [[ "$#" -ne 1 ]]; then
  #   tok="${TOKENIZER_TYPE:-Llama2}"
  # else
  #   tok="$1"
  # fi
  # echo "Setting up tokenizer with ${tok}"
  # setup_tokenizer "${tok}"
  # tok="${TOKENIZER_TYPE:-}"
  # if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then
  #   export TOKENIZER_TYPE="GPT2"
  #   export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}"
  #   export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json"
  #   export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt"
  #   export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document"
  #   # TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file ${MERGE_FILE}"
  #   DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}"
  #   # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}"
  # # else [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == LLAMA* ]]; then
  # else
  #   export TOKENIZER_TYPE="Llama2"
  #   tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model
  #   export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^
  #   TOKENIZER_FLAGS="${TOKENIZER_FLAGS} --tokenizer-type Llama2Tokenizer"
  # dfldir="${WORKING_DIR}/ALCF/data-lists"
  local host ndocs ws dfl_stem dcp
  # Start empty so a value left over from an earlier call or from the
  # environment can never be picked up by accident.
  local dfl_fallback=""
  host=$(hostname)
  # =====[Set DATA_FILE_LIST_FALLBACK based on current machine]==============
  if [[ ${host} == x4* ]]; then # -----------------------------[AURORA]
    dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt"
  elif [[ ${host} == x1* ]]; then # ----------------------------[SUNSPOT]
    # shellcheck: source ./data-lists/sunspot/books.txt
    dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/books.txt"
  elif [[ ${host} == x3* ]]; then # -------------------[POLARIS / SIRIUS]
    if [[ "${PBS_O_HOST:-}" == sirius* ]]; then # ---------------[SIRIUS]
      # shellcheck: source ./data-lists/sirius/books.txt
      dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sirius/books.txt"
    else # -----------------------------------------------------[POLARIS]
      # Same rule as setEnv: an x3* host that is not Sirius is Polaris. The
      # old `elif PBS_O_HOST == polaris*` left the fallback unset otherwise.
      # shellcheck: source ./data-lists/polaris/books.txt
      dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt"
    fi
  elif [[ ${host} == login* || ${host} == nid* ]]; then # [PERLMUTTER]
    dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt"
  else # -----------------------------------------------------------[UNKNOWN]
    echo "Unknown hostname. Must manually specify DATA_FILE_LIST."
  fi
  # ==========================================================================
  # set `dfl` to `dfl_fallback` if not passed as an argument,
  # use this data file list to call `setData`
  dfl="${1:-${dfl_fallback}}"
  printf "Calling: setData() with %s\n" "${dfl}"
  # Fail early instead of exporting values derived from a missing file.
  if [[ -z "${dfl}" || ! -r "${dfl}" ]]; then
    printf "[setData] Unable to read data file list: '%s'\n" "${dfl}" >&2
    return 1
  fi
  ndocs=$(wc -l < "${dfl}")
  ws=$(sumWeights "${dfl}")
  dfl_stem="${dfl##*/}"
  dfl_stem="${dfl_stem//.txt/}"
  # dcp="${OUTPUT_PREFIX:-$(get_output_prefix)}/.cache/${dfl_stem}/index-cache"
  dcp=".cache/${dfl_stem}/index-cache"
  export DATA_FILE_LIST="${dfl}"
  export NUM_DOCS="${ndocs}"
  export WEIGHT_SUM="${ws}"
  export DFL_STEM="${dfl_stem}"
  export DATA_CACHE_PATH="${dcp}"
  export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}" # --data-cache-path ${DATA_CACHE_PATH}"
  echo "--------------------"
  echo "Updated environment:"
  printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}"
  printf "NUM_DOCS: %s\n" "${NUM_DOCS}"
  printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}"
  printf "DFL_STEM: %s\n" "${DFL_STEM}"
  printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}"
  printf "DATA_FLAGS: %s\n" "${DATA_FLAGS}"
  echo "--------------------"
  # fi
  # export DATA_FLAGS="${DATA_FLAGS}"
  # export TOKENIZER_FLAGS="${TOKENIZER_FLAGS}"
  # printf "[setData] DATA_FLAGS: %s\n" "$(printGreen ${DATA_FLAGS})"
  # printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta ${TOKENIZER_FLAGS})"
}

#######################################
# Write a DeepSpeed JSON config assembled from the current environment.
# Globals (read): GLOBAL_BATCH, MICRO_BATCH, GRAD_ACC_STEPS, ZERO_STAGE, PP,
#                 DTYPE (all required), CPU_OPTIMIZER (optional; any value
#                 other than 0 enables CPU optimizer offload for stage 1/2)
# Arguments: $1 - path of the config file to (over)write
# Notes: still calls `exit 1` on missing variables or a wrong argument count.
#        An unsupported ZERO_STAGE only prints a warning; the file is then
#        written without a "zero_optimization" section.
#######################################
generateDSconfig() {
  local v
  # Scratch strings are local and start empty so nothing leaks in from a
  # previous call (e.g. a stale `zero` section when ZERO_STAGE is unsupported).
  local extra="" zero="" dtype="" common flops_profiler
  for v in "${GLOBAL_BATCH:-}" "${MICRO_BATCH:-}" "${GRAD_ACC_STEPS:-}" "${ZERO_STAGE:-}" \
    "${PP:-}" "${DTYPE:-}"; do
    if [[ -z "${v}" ]]; then
      echo "Please export required envs before execute $0"
      exit 1
    fi
  done
  if [[ $# -ne 1 ]]; then
    echo "Usage: $0 config_file"
    exit 1
  fi
  # \"optimizer\": {
  #   \"type\": \"AdamW\",
  #   \"params\": {
  #     \"lr\": ${LR},
  #     \"beta1\": 0.9,
  #     \"beta2\": 0.95,
  #     \"eps\": 1e-5,
  #     \"weight_decay\": 1e-1
  #   }
  # },
  # \"scheduler\": {
  #   \"type\": \"WarmupLR\",
  #   \"params\": {
  #     \"warmup_min_lr\": 0.00003,
  #     \"warmup_max_lr\": 0.0003,
  #     \"warmup_num_steps\": 5000
  #   }
  # },
  common="\
  \"train_batch_size\": $GLOBAL_BATCH,
  \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
  \"steps_per_print\": 1,
  \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
  \"zero_allow_untested_optimizer\": true,
  \"gradient_clipping\": 1.0,
  \"activation_checkpointing\": {
    \"partition_activations\": true,
    \"contiguous_memory_optimization\": true
  },
  \"wall_clock_breakdown\": false,"
  flops_profiler="\
  \"flops_profiler\": {
    \"enabled\": true,
    \"profile_step\": 2,
    \"module_depth\": -1,
    \"top_modules\": 1,
    \"detailed\": true,
    \"output_file\": null
  }"
  if [[ "${DTYPE}" == "bf16" ]]; then
    dtype="\
  \"communication_data_type\": \"bf16\",
  \"fp16\": {
    \"enabled\": false,
    \"loss_scale\": 0,
    \"loss_scale_window\": 1000,
    \"hysteresis\": 2,
    \"min_loss_scale\": 1
  },
  \"bfloat16\": {
    \"enabled\": true,
    \"loss_scale\": 1.0
  },"
  elif [[ "${DTYPE}" == "fp16" ]]; then
    dtype="\
  \"communication_data_type\": \"fp16\",
  \"fp16\": {
    \"enabled\": true,
    \"loss_scale\": 0,
    \"loss_scale_window\": 1000,
    \"hysteresis\": 2,
    \"min_loss_scale\": 1
  },
  \"bfloat16\": {
    \"enabled\": false,
    \"loss_scale\": 1.0
  },"
  else
    dtype="  \"communication_data_type\": \"fp32\","
  fi
  if [[ "${ZERO_STAGE}" == 3 ]]; then
    zero="\
  \"zero_optimization\": {
    \"stage\": 3,
    \"reduce_scatter\": false,
    \"mics_shard_size\": 4,
    \"mics_hierarchical_params_gather\": true,
    \"stage3_max_live_parameters\": 3e9,
    \"stage3_max_reuse_distance\": 3e9,
    \"stage3_param_persistence_threshold\": 1e5,
    \"stage3_prefetch_bucket_size\": 5e7,
    \"contiguous_gradients\": true,
    \"overlap_comm\": true,
    \"reduce_bucket_size\": 90000000,
    \"sub_group_size\": 1e9,
    \"offload_optimizer\": {
      \"device\": \"none\",
      \"buffer_count\": 4,
      \"pipeline_read\": false,
      \"pipeline_write\": false,
      \"pin_memory\": true
    }
  },"
  # elif [[ $ZERO_STAGE == 2 ]]; then
  elif [[ "${ZERO_STAGE}" == 2 || "${ZERO_STAGE}" == 1 ]]; then
    # if [[ -n "${CPU_OPTIMIZER}" ]]; then
    if [[ "${CPU_OPTIMIZER:-}" != 0 ]]; then
      echo "!!!! CAUGHT CPU_OPTIMIZER !!!!"
      zero="\
  \"zero_optimization\": {
    \"stage\": $ZERO_STAGE,
    \"offload_optimizer\": {
      \"device\": \"cpu\"
    }
  },"
    else
      zero="\
  \"zero_optimization\": {
    \"stage\": $ZERO_STAGE
  },"
    fi
    # elif [[ $ZERO_STAGE == 1 ]]; then
    # Numeric comparison: `[[ $PP > 1 ]]` compared the values as strings.
    if ((PP > 1)); then
      extra="\
  \"data_types\": {
    \"grad_accum_dtype\": \"fp32\"
  },
  \"comms_logger\": {
    \"enabled\": true,
    \"verbose\": false,
    \"prof_all\": true,
    \"debug\": false
  },"
    else
      # echo 'please add the config for zero_stage 1 without pipeline-parallelism'
      extra="\
  \"comms_logger\": {
    \"enabled\": true,
    \"verbose\": false,
    \"prof_all\": true,
    \"debug\": false
  },"
    fi
  else
    echo 'Please add the correct config set!!!'
  fi
  # flops_profiler must at the end because no ',' is allowed at the end
  # (here-document restored: the `<<EOT` operator was missing, which turned
  # this into "read from $1" followed by stray commands)
  cat <<EOT > "$1"
{
$common
$zero
$dtype
$extra
$flops_profiler
}
EOT
}

# ---- Colour helpers: print every argument on its own line, in bold colour ----
printBlack() {
  printf "\e[1;30m%s\e[0m\n" "$@"
}

printRed() {
  printf "\e[1;31m%s\e[0m\n" "$@"
}

printGreen() {
  printf "\e[1;32m%s\e[0m\n" "$@"
}

printYellow() {
  printf "\e[1;33m%s\e[0m\n" "$@"
}

printBlue() {
  printf "\e[1;34m%s\e[0m\n" "$@"
}

printMagenta() {
  printf "\e[1;35m%s\e[0m\n" "$@"
}

printCyan() {
  printf "\e[1;36m%s\e[0m\n" "$@"
}

printWhite() {
  printf "\e[1;37m%s\e[0m\n" "$@"
}

#### [DEPRECATED] ###########################################################
# if [[ -z "${HOSTFILE}" ]]; then
#     makeHostfiles || exit      # 4. create `deepspeed` hostfile from `$PBS_NODEFILE`
# else
#     echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}"
# fi
# ----------------------------------------------------------------------------
# setDSlauncher "${HERE}" || exit  # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}`
# ----------------------------------------------------------------------------
# TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())')
# printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}"
# if [[ "${TORCH_DEVICE}" == "cuda" ]]; then
#     printf %s "Setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
#     PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# fi
# ----------------------------------------------------------------------------
# export MPICH_GPU_SUPPORT_ENABLED=1
# export CUDA_DEVICE_MAX_CONNECTIONS=1
# export NCCL_DEBUG=INFO
#############################################################################
# [carried over verbatim: diff header and first lines of the next file, ALCF/mds_to_hf.py] diff --git a/ALCF/mds_to_hf.py b/ALCF/mds_to_hf.py new file mode 100644 index 0000000000..a336788274 --- /dev/null +++ b/ALCF/mds_to_hf.py @@ -0,0 +1,91 @@ +# Usage : python mds_to_hf.py --mds_checkpoint --output_dir +# Tips : Do not run on login node. +# This script currently only takes care of tp=1.
# Takes an AuroraGPT Llama model trained with Megatron-DeepSpeed and converts to LLamaCausalForLM architecture from HuggingFace.

import argparse
import torch
import pdb
import os


def repeat_kv_wt(x, np):
    """Repeat every row of ``x`` ``np`` times along dim 0.

    Currently unused by the conversion (k/v weights are kept at the GQA head
    dimension); kept for callers that want the expanded weights.
    """
    return torch.repeat_interleave(x, dim=0, repeats=np)


def Update_llama_config(Llama_config, mds_args):
    """Copy the model hyper-parameters of a Megatron-DeepSpeed run onto a config.

    Args:
        Llama_config: config object to update in place (attributes are set).
        mds_args: dict of the Megatron-DeepSpeed training arguments.

    Returns:
        The same ``Llama_config`` object.

    Raises:
        KeyError: if a required key is missing from ``mds_args``.
    """
    if mds_args['swiglu']:
        Llama_config.hidden_act = "silu"
    Llama_config.hidden_size = mds_args['hidden_size']
    Llama_config.intermediate_size = mds_args['ffn_hidden_size']
    Llama_config.max_position_embeddings = mds_args['max_position_embeddings']
    Llama_config.num_attention_heads = mds_args['num_attention_heads']
    Llama_config.num_hidden_layers = mds_args['num_layers']
    Llama_config.num_key_value_heads = mds_args['num_key_value_heads']
    Llama_config.rms_norm_eps = mds_args['layernorm_epsilon']
    Llama_config.rope_theta = mds_args['rope_theta']
    Llama_config.vocab_size = mds_args['padded_vocab_size']
    return Llama_config


def check_checkpoint_filename(path):
    """Return the base name of ``path`` if it looks like ``mp_rank_*.pt``.

    The previous inline check was ``assert ("<message>")``, which can never
    fail, and combined the two conditions with ``and`` instead of ``or``.

    Raises:
        ValueError: if the file name does not start with ``mp_rank`` or does
            not end with ``.pt``.
    """
    name = os.path.basename(str(path))
    if not (name.startswith('mp_rank') and name.endswith('.pt')):
        raise ValueError("Provide the right file path, The file should be of format mp_rank_*.pt")
    return name


def build_hf_state_dict(mds_model):
    """Map a loaded Megatron-DeepSpeed checkpoint onto HF Llama parameter names.

    Args:
        mds_model: the object returned by ``torch.load`` on an ``mp_rank_*.pt``
            file; ``mds_model['args']`` holds the training arguments and
            ``mds_model['module']['language_model']`` the weights.

    Returns:
        dict mapping HuggingFace parameter names to tensors.
    """
    mds_args = vars(mds_model['args'])
    language_model = mds_model['module']['language_model']
    encoder = language_model['encoder']

    state_dict = {}
    dim = mds_args['kv_channels']
    inv_freq = 1.0 / (mds_args['rope_theta'] ** (torch.arange(0, dim, 2).float() / dim))
    hidden_size = mds_args['hidden_size']
    kv_dim = mds_args['kv_channels'] * mds_args['num_key_value_heads']
    for layer_i in range(mds_args['num_layers']):
        # SELF ATTENTION layers.
        # get the q, k, v weights separately. Keeping k and v at the GQA head dim, since the transformers/models/llama/modelling_utils will take care of it.
        # NOTE(review): the slicing assumes the fused weight is laid out as
        # [q (hidden_size rows) | k (kv_dim rows) | v (kv_dim rows)] - confirm
        # against the checkpoint layout for the Megatron-DeepSpeed version used.
        fused_qkv = encoder[f"layers.{layer_i}.self_attention.query_key_value.weight"]
        state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = fused_qkv[0:hidden_size]
        state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = fused_qkv[hidden_size:hidden_size+kv_dim]
        #state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size:hidden_size+kv_dim], kv_groups)
        state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim]
        #state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim],kv_groups)
        state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = encoder[f"layers.{layer_i}.self_attention.dense.weight"]

        # MLP Layers: the fused projection is split in two halves (gate, up).
        fused_mlp = encoder[f"layers.{layer_i}.mlp.dense_h_to_4h.weight"]
        chunked_mlp = torch.chunk(fused_mlp, 2, dim=0)
        state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = chunked_mlp[0]
        state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = chunked_mlp[1]
        state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = encoder[f"layers.{layer_i}.mlp.dense_4h_to_h.weight"]

        #LayerNorm weights and RoPe
        state_dict[f"model.layers.{layer_i}.input_layernorm.weight"] = encoder[f"layers.{layer_i}.input_layernorm.weight"]
        state_dict[f"model.layers.{layer_i}.post_attention_layernorm.weight"] = encoder[f"layers.{layer_i}.post_attention_layernorm.weight"]

        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq

    # Get the non-encoder layer weights.
    state_dict["model.embed_tokens.weight"] = language_model['embedding']['word_embeddings']['weight']
    state_dict["model.norm.weight"] = encoder['final_layernorm.weight']
    state_dict["lm_head.weight"] = language_model['output_layer']['weight']
    return state_dict


def main():
    """Command-line entry point: convert one checkpoint and write the HF files."""
    # Imported here so the helpers above can be imported without transformers.
    from transformers import LlamaConfig

    parser = argparse.ArgumentParser()
    parser.add_argument('--mds_checkpoint', required=True)
    parser.add_argument('--output_dir', required=True)
    # Previously hard-coded; the default keeps the old behaviour.
    parser.add_argument('--cache_dir', default='/eagle/datascience/vsastry/huggingface')
    args = parser.parse_args()

    # make output_dir if it does not exits.
    os.makedirs(args.output_dir, exist_ok=True)

    filename = str(args.mds_checkpoint)
    check_checkpoint_filename(filename)
    print(f"loading mds checkpoint {filename}")

    mds_model = torch.load(args.mds_checkpoint, map_location=torch.device('cpu'))
    # Only the configuration of the reference model is needed, so load just
    # that instead of instantiating the whole 7B model.
    Llama_config = LlamaConfig.from_pretrained("meta-llama/Llama-2-7b-hf", cache_dir=args.cache_dir)
    Updated_Llama_config = Update_llama_config(Llama_config, vars(mds_model['args']))
    # save the updated config.json file
    Updated_Llama_config.to_json_file(os.path.join(args.output_dir, 'config.json'))

    state_dict = build_hf_state_dict(mds_model)

    # Save the model in the hf output path.
    torch.save(state_dict, os.path.join(args.output_dir, "pytorch_model.bin"))


if __name__ == "__main__":
    main()

# [carried over verbatim: diff header and first lines of the next file, ALCF/pre-AuroraGPT/README.md] diff --git a/ALCF/pre-AuroraGPT/README.md b/ALCF/pre-AuroraGPT/README.md new file mode 100644 index 0000000000..e8f4127876 --- /dev/null +++ b/ALCF/pre-AuroraGPT/README.md @@ -0,0 +1,96 @@ +# Megatron-DeepSpeed (@ [ALCF](https://alcf.anl.gov)) + +![image](https://github.com/argonne-lcf/Megatron-DeepSpeed/assets/5234251/f06df155-30e8-4894-a4c2-c17ff4b34ada) + +We describe below the instructions for launching distributed training with +Microsoft's Megatron-DeepSpeed and briefly describe some parallelism +strategies and various optimizations that are supported. + +> [!IMPORTANT] +> We maintain this (forked) version at +> [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) +> that has some [helper scripts](#helper-scripts) for launching and setting +> various training options. +> +> These changes are entirely self-contained **HERE** in [`ALCF/`](.) + +## Setup + +1. Load `conda` and activate base environment: + + ```bash + # load conda + activate base env + module load conda/2023-10-04 ; conda activate base + ``` + +1. Clone + [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) + and navigate into it: + + ```bash + # clone + navigate into Megatron-DeepSpeed repo + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed + cd Megatron-DeepSpeed + ``` + +1.
Make virtual environment (on top of base conda): + + ```bash + # make virtual environment (on top of base conda) + mkdir -p venvs/polaris/2023-10-04 + python3 -m venv venvs/polaris/2023-10-04 --system-site-packages + source venvs/polaris/2023-10-04/bin/activate + ``` + +1. Install missing dependency: + + ```bash + # install *missing dependency + python3 -m pip install "git+https://github.com/saforem2/ezpz" + ``` + +1. Launch training: + + ```bash + # ---- launch training ----------------------- + # - MODEL_SIZE_KEY: defined in ALCF/model.sh + # - other args: defined in ALCF/args.sh + # --------------------------------------------- + MODEL_SIZE_KEY="GPT25B" \ + SEQ_LEN=4096 \ + USE_FLASH_ATTN_V2=1 \ + MICRO_BATCH=1 \ + GAS=1 \ + SP_TYPE="megatron" \ + ZERO_STAGE=1 \ + ./ALCF/train-gpt3.sh + ``` + + +## Helper Scripts + +- [`pretrain_gpt_alcf.py`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/pretrain_gpt_alcf.py) +- 📂 [`ALCF/`](https://github.com/argonne-lcf/Megatron-DeepSpeed/tree/main/ALCF) + `├──` [`args.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/args.sh) + `├──` [`launch.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/launch.sh) + `├──` [`model.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/model.sh) + `├──` [`setup.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/setup.sh) + `├──` [`submit-pbs.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit-pbs.sh) + `├──` [`submit.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit.sh) + `└──` [`train-gpt3.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/train-gpt3.sh) + + +
+
pretrain_gpt_alcf.py +
Python module to be launched. Running `./ALCF/train-gpt3.sh` will automatically build an `mpiexec` command and launch this module.
+
ALCF/train-gpt3.sh +
Main entry point for training. This script will automatically source the rest of the required ALCF/*.sh scripts below
+
ALCF/model.sh
+
Contains some example model architectures for GPT3-style models
+
ALCF/args.sh
+
Logic for parsing / setting up runtime options for Megatron and DeepSpeed.
+
ALCF/setup.sh
+
Locate and activate virtual environment to be used, ensure MPI variables are set properly
+
ALCF/launch.sh
+
Identify available resources and build the command to be run, i.e. figure out how many `{nodes, GPUs per node, GPUs total}` to pass to `mpi{run,exec}`, then use this to build `mpiexec {mpiexec-args} python3 pretrain_gpt.py`
+
diff --git a/ALCF/pre-AuroraGPT/args.sh b/ALCF/pre-AuroraGPT/args.sh new file mode 100755 index 0000000000..17e0018110 --- /dev/null +++ b/ALCF/pre-AuroraGPT/args.sh @@ -0,0 +1,533 @@ +#!/bin/bash --login + +function FindMegatron() { + MEGATRON_INSTALL=$(python3 -c 'import megatron; print(megatron.__file__)' | tail -1) + MEGATRON_DIR=$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1))) +} + +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +HOSTNAME=$(hostname) +sourceFile "${ALCF_DIR}/setup.sh" + +WORLD_SIZE="${NGPUS}" +PARALLEL_SIZE="${WORLD_SIZE}" +echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" + +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-GPT13B}" +echo "==========================+" +echo "Using ${MODEL_SIZE_KEY}" +echo "==========================+" + +sourceFile "${ALCF_DIR}/model.sh" + +MODEL_TYPE=${MODEL_TYPE:-gpt} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model Parallel / Pipeline Parallel ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +export DDP_IMPL="local" # FSDP | local | torch +export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +export SEQ_LEN=${SEQ_LEN:-2048} +export PPSIZE=${PPSIZE:-1} +export MICRO_BATCH=${MICRO_BATCH:-1} +export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt +export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron +export 
ZERO_STAGE=${ZERO_STAGE:-1} +export MPSIZE=${MPSIZE:-${WORLD_SIZE:-1}} +export SPSIZE=${SPSIZE:-1} + +# +# Deal with Sequence Parallel implementation --------------------------------------- +# ---------------------------------------------------------------------------------- +if [[ ${SP_TYPE} == "ds" ]]; then + # NOTE: -------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism + # -------------------------------------------------------------------------- + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + # hacky workaround to try and use SP_TYPE="ds" + MPSIZE="${WORLD_SIZE}" + # ------------------------------------------------------------------------ + # Update [2023-08-22]: Chengming mentioned that this is an internal issue + # and will NOT work currently + # ------------------------------------------------------------------------ + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + export ZERO_STAGE="${ZERO_STAGE}" +elif [[ ${SP_TYPE} == "megatron" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. 
|| implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + if [[ ${PPSIZE} > 1 ]]; then # && ${MPSIZE}==${WORLD_SIZE} ]]; + MPSIZE=$(( WORLD_SIZE / PPSIZE )) + echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))" + echo "MPSIZE: $MPSIZE" + # MPSIZE="${WORLD_SIZE}/" + fi + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" +else + echo "Unexpected SP_TYPE: ${SP_TYPE}" + # exit 1 +fi +# ------------------------------------------------------------------------ +# +echo "####################################################" +echo "USING: ${SP_TYPE}" +echo "SPSIZE: ${SPSIZE}" +echo "PPSIZE: ${SPSIZE}" +echo "MPSIZE: ${MPSIZE}" +echo "ZERO_STAGE: ${ZERO_STAGE}" +echo "WORLD_SIZE: ${WORLD_SIZE}" +echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "####################################################" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" 
echo "########################################################"

# ---- Global batch size -------------------------------------------------
# Start from NGPUS * MICRO_BATCH * GRADIENT_ACCUMULATION_STEPS, then divide by
# the model / pipeline / sequence parallel degrees.
GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GRADIENT_ACCUMULATION_STEPS ))
echo "GB = NGPUS * MB * GAS = ${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS} = ${GLOBAL_BATCH}"

GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE ))
echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}"

# Integer division can round down to zero; clamp to one sample per step.
if [[ "${GLOBAL_BATCH}" == 0 ]]; then
  GLOBAL_BATCH=1
fi
export GLOBAL_BATCH="$GLOBAL_BATCH"

# Data-parallel degree = ranks left after pipeline and model parallelism.
DPSIZE=$(( WORLD_SIZE / PPSIZE / MPSIZE ))

echo "--------------------------------"
echo "GLOBAL_BATCH=${GLOBAL_BATCH}"
echo "USING DPSIZE: ${DPSIZE}"
echo "--------------------------------"

# ┏━━━━━━━━━━━━┓
# ┃ Data paths ┃
# ┗━━━━━━━━━━━━┛
# Resolve the hostname once instead of forking `hostname` for every test.
_alcf_host=$(hostname)
if [[ "${_alcf_host}" == nid* || "${_alcf_host}" == login* ]]; then
  DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed"
  DATA_TYPE="BookCorpusDataset_text_document"
elif [[ "${_alcf_host}" == theta* || "${_alcf_host}" == x3* ]]; then
  DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k"
  # Previously tried alternatives:
  #   DATA_PARENT="/home/foremans/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed"
  #   DATA_TYPE="books-0001_text_document"
  #   DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k"
  DATA_TYPE="genslm_subsample_200k_sequence_document"
else
  echo "Unable to determine DATA_PARENT for ${_alcf_host}."
  echo "Exiting!"
  exit 1
fi

DATA_DIR="${DATA_PARENT}/dataset"
DATA_PATH="${DATA_DIR}/${DATA_TYPE}"
VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json"
MERGE_FILE="${DATA_DIR}/gpt2-merges.txt"

# ┏━━━━━━━━━━━━━━━━━━━┓
# ┃ FILE I/O SETTINGS ┃
# ┗━━━━━━━━━━━━━━━━━━━┛
# RUN_STR is built right-to-left; each step prepends one more group of tags.
RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}"
RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}"
RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}"
RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}"
RUN_STR="${MODEL_SIZE}_${RUN_STR}"

# Tag the run with the selected flash-attention variant exactly once. The
# earlier code prepended the tag once per alias that was set, yielding
# names like "flashAttn_v1_flashAttn_v1_...".
if [[ -n "${USE_FLASH_ATTN1}" || -n "${USE_FLASH_ATTN_V1}" ]]; then
  # Flash Attention 1
  RUN_STR="flashAttn_v1_${RUN_STR}"
elif [[ -n "${USE_FLASH_ATTN2}" || -n "${USE_FLASH_ATTN_V2}" ]]; then
  # Flash Attention 2
  RUN_STR="flashAttn_v2_${RUN_STR}"
elif [[ -n "${USE_FLASH_ATTN_TRITON}" ]]; then
  # Triton + Flash Attn
  RUN_STR="flashAttn_triton_${RUN_STR}"
else
  echo "Not using Flash Attention!"
fi

if [[ "${DDP_IMPL}" == 'FSDP' ]]; then
  RUN_STR="FSDP_${RUN_STR}"
fi
if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
  RUN_STR="actCkpt_${RUN_STR}"
fi
if [[ "${USE_SEQUENCE_PARALLEL}" == 1 ]]; then
  RUN_STR="SP_${RUN_STR}"
fi

RUN_STR="${MODEL_TYPE}_${RUN_STR}"

OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}"
CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR"
TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard"

DATE=$(date)
export DATE="${DATE}"
export RUN_STR="${RUN_STR}"
export MODEL_SIZE="$MODEL_SIZE"
export TENSORBOARD_DIR="${TENSORBOARD_DIR}"
export OUTPUT_DIR="${OUTPUT_DIR}"
mkdir -p "$OUTPUT_DIR/tensorboard/wandb"
mkdir -p "$CHECKPOINT_DIR"
mkdir -p "$TENSORBOARD_DIR"
mkdir -p "$OUTPUT_DIR"
echo "OUTPUT TO: ${OUTPUT_DIR}"

# Disabled NVME_PATH selection, kept for reference:
#   x*     -> /local/scratch/
#   theta* -> /raid/scratch/
#   other  -> /tmp/

# Only the GPT entry point takes dataset / vocab / merge arguments.
if [[ "${MODEL_TYPE}" == "gpt" ]]; then
  DATA_LOAD_ARGS="--data-path $DATA_PATH --vocab-file $VOCAB_FILE --merge-file $MERGE_FILE"
else
  DATA_LOAD_ARGS=""
fi

# Set to cpu for offloading to cpu
# for larger models
OFFLOAD_DEVICE="${OFFLOAD_DEVICE:-cpu}"
CPU_OPTIM=" --cpu-optimizer"

# # Set to none and empty string for no cpu offloading
# OFFLOAD_DEVICE="none"
# CPU_OPTIM=" "

# W&B logging is on unless WANDB_MODE=disabled.
if [[ "${WANDB_MODE}" == "disabled" ]]; then
  WANDB_ENABLE="false"
else
  WANDB_ENABLE="true"
fi
echo "WANDB_ENABLE: ${WANDB_ENABLE}"

# ┏━━━━━━━━━━━━━━━━━━┓
# ┃ DeepSpeed Config ┃
# ┗━━━━━━━━━━━━━━━━━━┛
DS_CONFIG=${PARENT}/ds_config-gpt.json
echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!"
echo " DS_CONFIG: ${DS_CONFIG}"
echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!"

#######################################
# Write the DeepSpeed JSON config for the current ZERO_STAGE.
# The here-document operator is restored (`cat <<EOT > file`); as written
# before, `cat < file` read the file and the JSON lines ran as commands.
# Globals:   ZERO_STAGE, MICRO_BATCH, GRADIENT_ACCUMULATION_STEPS,
#            WANDB_ENABLE (read)
# Arguments: $1 - output path (defaults to DS_CONFIG)
# NOTE(review): the offload "device" entries are hard-coded to "cpu" and do
# not follow OFFLOAD_DEVICE — confirm whether that is intended.
#######################################
_alcf_write_ds_config() {
  local out="${1:-${DS_CONFIG}}"
  if [[ "${ZERO_STAGE}" == "3" ]]; then
    cat <<EOT > "${out}"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 1,
  "wall_clock_breakdown" : true,
  "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 3,
    "stage3_max_live_parameters": 3e9,
    "stage3_max_reuse_distance": 3e9,
    "stage3_param_persistence_threshold": 1e5,
    "stage3_prefetch_bucket_size": 1e9,
    "contiguous_gradients": true,
    "overlap_comm": true,
    "reduce_bucket_size": 90000000,
    "sub_group_size": 5e7,
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_optimizer": {
      "device": "cpu",
      "buffer_count": 4,
      "pipeline_read": false,
      "pipeline_write": false,
      "pin_memory": true
    }
  },
  "fp16": {
    "enabled": true,
    "initial_scale_power" : 12,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "single_submit": false,
    "overlap_events": true,
    "thread_count": 2
  },
  "flops_profiler": {
    "enabled": true,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 3,
    "detailed": true,
    "output_file": null
  },
  "comms_logger": {
    "enabled": true,
    "verbose": false,
    "prof_all": false,
    "debug": false
  },
  "wandb": {
    "enabled": $WANDB_ENABLE,
    "project": "GenSLM-Megatron-DS"
  }
}
EOT
  else
    cat <<EOT > "${out}"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS,
  "steps_per_print": 1,
  "wall_clock_breakdown" : true,
  "zero_force_ds_cpu_optimizer": false,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "offload_param": {
      "device": "cpu",
      "nvme_path": "/raid/scratch",
      "pin_memory": false
    },
    "offload_optimizer": {
      "device": "cpu",
      "nvme_path": "/raid/scratch/"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 0.001,
      "warmup_num_steps": 1000
    }
  },
  "fp16": {
    "enabled": true,
    "initial_scale_power": 12
  },
  "flops_profiler": {
    "enabled": true,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 3,
    "detailed": true,
    "output_file": null
  },
  "comms_logger": {
    "enabled": true,
    "verbose": false,
    "prof_all": false,
    "debug": false
  },
  "wandb": {
    "enabled": $WANDB_ENABLE,
    "project": "GenSLM-Megatron-DS"
  }
}
EOT
  fi
}
_alcf_write_ds_config "${DS_CONFIG}"

# ┏━━━━━━━━━━━━━━━━━━━━━┓
# ┃ DeepSpeed Arguments ┃
# ┗━━━━━━━━━━━━━━━━━━━━━┛
# Each step prepends, so the final string lists the flags in reverse order.
if [[ "$DDP_IMPL" != "FSDP" ]]; then
  ds_args=""
  ds_args=" --deepspeed ${ds_args}"
  ds_args=" --deepspeed_mpi ${ds_args}"
  ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
  ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
  if [[ "$PPSIZE" == 1 ]]; then
    ds_args="--no-pipeline-parallel ${ds_args}"
  else
    ds_args=" --pipeline-model-parallel-size ${PPSIZE} ${ds_args}"
  fi
  if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then
    ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
  fi
fi

# ┏━━━━━━━━━━━━━━━━━━━━━━┓
# ┃ MEGATRON-LM SETTINGS ┃
# ┗━━━━━━━━━━━━━━━━━━━━━━┛
gpt_args=(
  "--no-async-tensor-model-parallel-allreduce"
  "--seed ${RANDOM}"
  "--DDP-impl ${DDP_IMPL}"
  "--pipeline-model-parallel-size ${PPSIZE}"
  "--tensor-model-parallel-size ${MPSIZE}"
  "--ds-sequence-parallel-size ${SPSIZE}"
  "--num-layers ${NLAYERS}"
  "--hidden-size ${HIDDEN}"
  "--num-attention-heads ${ATEN_HEADS}"
  "--micro-batch-size ${MICRO_BATCH}"
  "--global-batch-size ${GLOBAL_BATCH}"
  "--seq-length ${SEQ_LEN}"
  "--max-position-embeddings ${SEQ_LEN}"
  "--train-iters 10"
  "--lr-decay-iters 320000"
  "--num-workers 0"
  "$DATA_LOAD_ARGS"
  "--data-impl mmap"
  "--split 949,50,1"
  "--distributed-backend nccl"
  "--lr 0.00015"
  "--lr-decay-style cosine"
  "--min-lr 1.0e-5"
  "--weight-decay 1e-2"
  "--clip-grad 1.0"
  "--lr-warmup-fraction .01"
  "--log-interval 1"
  "--save-interval 1000"
  "--eval-interval 1000"
  "--eval-iters 0"
  "--override-opt_param-scheduler"
  "--tensorboard-dir ${TENSORBOARD_DIR}"
  "--log-timers-to-tensorboard"
  "--tensorboard-log-interval 1"
)

if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then
  gpt_args+=("--checkpoint-activations" "--checkpoint-num-layers 1")
fi

# fp16 with DeepSpeed, bf16 with FSDP.
if [[ "$DDP_IMPL" != "FSDP" ]]; then
  gpt_args+=("--fp16")
else
  gpt_args+=("--bf16")
fi

# Pass the flash-attention flag once per variant; it used to be appended
# once per alias variable that happened to be set.
if [[ -n "${USE_FLASH_ATTN1}" || -n "${USE_FLASH_ATTN_V1}" ]]; then
  gpt_args+=("--use-flash-attn-v1")
elif [[ -n "${USE_FLASH_ATTN2}" || -n "${USE_FLASH_ATTN_V2}" ]]; then
  gpt_args+=("--use-flash-attn-v2")
elif [[ -n "${USE_FLASH_ATTN_TRITON}" ]]; then
  gpt_args+=("--use-flash-attn-triton")
fi

if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then
  export CUDA_DEVICE_MAX_CONNECTIONS=1
  gpt_args+=("--sequence-parallel")
fi

# Numeric test; `[[ a > b ]]` compares strings.
if (( ${ZERO_STAGE:-0} > 0 )); then
  gpt_args+=("--cpu-optimizer")
fi

# Collapse to two strings (Megatron args, DeepSpeed args), then join them.
# join_by is not defined in this part of the file; it must already be in scope.
export gpt_args=(
  "${gpt_args[*]}"
  "${ds_args[*]}"
)
ARGS="$(join_by ' ' ${gpt_args[*]})"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "ARGS: ${ARGS}"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

# ---- patch file boundary (header kept verbatim, commented out) ----------
# diff --git a/ALCF/pre-AuroraGPT/launch.sh b/ALCF/pre-AuroraGPT/launch.sh
# new file mode 100755
# index 0000000000..2dd834f568
# --- /dev/null
# +++ b/ALCF/pre-AuroraGPT/launch.sh
# @@ -0,0 +1,193 @@
#!/bin/bash --login

HOST=$(hostname)

# Print the current working directory, as reported by Python.
function WhereAmI() {
  python3 -c 'import os; print(os.getcwd())'
}

USER=$(whoami)
HERE=$(WhereAmI)
ALCF_DIR="${HERE}/ALCF"
PARENT=$(dirname "${ALCF_DIR}")

echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"
echo "ALCF_DIR: ${ALCF_DIR}"
echo "PARENT: ${PARENT}"
echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"

# Source $1 if it is a regular file; otherwise report and carry on.
function sourceFile() {
  FILE="$1"
  echo "source-ing ${FILE}"
  if [[ -f "${FILE}" ]]; then
    # shellcheck source=./setup.sh
    source "${FILE}"
  else
    echo "ERROR: UNABLE TO SOURCE ${FILE}"
  fi
}

MASTER_ADDR=$(uname -n)
MASTER_PORT=20010
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
MPI_WRAPPER="${SCRIPT_DIR}/mpi_wrapper"

# sourceFile "${ALCF_DIR}/args.sh"

# MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py"
MAIN="${PARENT}/pretrain_gpt_alcf.py"

# Print a short job banner (timestamp, dirs, model size, log location).
printJobInfo() {
  echo "Job started at: ${TSTAMP} on $(hostname)"
  echo "Job running in: ${DIR}"
  echo "Training Llama2 with ${MODEL_SIZE} parameters"
  echo "Writing logs to: ${OUTPUT_DIR}"
  echo 'to view output: tail -f $(tail -1 logfiles)'
  echo "i.e. tail -f $(tail -1 "${PARENT}"/logfiles)"
}

# Log the launch context to OUTPUT_LOG, then run EXEC with the caller's args.
# EXEC is expanded unquoted on purpose: it is a space-joined command line.
launchJob() {
  echo "using: $(which python3)" | tee -a "${OUTPUT_LOG}"
  printJobInfo | tee -a "${OUTPUT_LOG}"
  echo EXEC="${EXEC}" | tee -a "${OUTPUT_LOG}"
  echo "Writing logs to: ${OUTPUT_LOG}" | tee -a "${OUTPUT_LOG}"
  ${EXEC} "$@" # >> "${OUTPUT_LOG}" 2>&1 &
}

# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
# ┃ Use all available GPUs a single nodes ┃
# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
# Globals read:    HOSTFILE, MPI_COMMAND, MPI_DEFAULTS, MPI_ELASTIC, MPI_WRAPPER,
#                  MASTER_ADDR, MASTER_PORT, MAIN, ARGS, OUTPUT_DIR, PARENT, TSTAMP
# Globals written: NHOSTS, NGPU_PER_HOST, NGPUS, _EXEC, EXEC, OUTPUT_LOG
fullNode() {
  echo "fullNode started"
  echo "MPI_COMMAND ${MPI_COMMAND}"
  echo "MPI_DEFAULTS ${MPI_DEFAULTS}"
  echo "NGPUS ${NGPUS}"
  echo "hostfile ${DIR}/hostfile"
  echo "MAIN ${MAIN}"
  echo "gpt_args ${ARGS}"
  NHOSTS=$(wc -l < "${HOSTFILE}")
  NGPU_PER_HOST=$(nvidia-smi -L | wc -l)
  NGPUS=$(( NHOSTS * NGPU_PER_HOST ))
  # hostname > $DIR/hostfile
  echo "\
    Running on $NHOSTS hosts \
    with $NGPU_PER_HOST GPUs each \
    for a total of $NGPUS GPUs"
  _EXEC=(
    "${MPI_COMMAND}"
    "${MPI_DEFAULTS}"
    "${MPI_ELASTIC}"
    "${MPI_WRAPPER}"
    "${MASTER_ADDR}"
    "${MASTER_PORT}"
    "${MAIN}"
    "${ARGS}"
  )
  # Join the array that was just built; this used to expand ${EXEC[*]},
  # leaving EXEC at whatever stale value it had.
  EXEC="${_EXEC[*]}"
  OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log"
  mkdir -p "$(dirname "${OUTPUT_LOG}")"
  echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles"
  printJobInfo | tee -a "${OUTPUT_LOG}"
  launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}"
}

# Older srun setup for Perlmutter (login*/nid* hosts); no-op elsewhere.
function setupSrunOld() {
  local host
  host=$(hostname)
  if [[ "${host}" == login* || "${host}" == nid* ]]; then
    export NODELIST="${SLURM_JOB_NODELIST:-${host}}"
    export MACHINE="Perlmutter"
    export NHOSTS="${SLURM_NNODES:-1}"
    export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}"
    export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
    export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u"
  else
    echo "Skipping setupSrun() on ${host}"
  fi
}

# Export NHOSTS / NGPU_PER_HOST / NGPUS / SRUN_EXEC on Perlmutter hosts.
function setupSrun() {
  local host
  host=$(hostname)
  if [[ "${host}" == login* || "${host}" == nid* ]]; then
    export NHOSTS="${SLURM_NNODES:-1}"
    export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}"
    export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
    export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose"
  else
    echo "Skipping setupSrun() on ${host}"
  fi
}

# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
# ┃ Use all available GPUs on all available nodes ┃
# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛
# Pick the launcher by hostname (theta*/x3* -> MPI, nid*/login* -> srun),
# run the job in the background with output appended to OUTPUT_LOG, and wait.
# setupPerlmutter is not defined in this part of the file.
elasticDistributed() {
  local host
  host=$(hostname)
  if [[ "${host}" == theta* || "${host}" == x3* ]]; then
    if [[ "${host}" == theta* ]]; then
      echo "Setting up ThetaGPU from ${host}"
      HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}"
    else
      echo "Setting up Polaris from ${host}"
      # Was ${HOSFILE:-...}: the typo always discarded a user-set HOSTFILE.
      HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"
    fi
    NHOSTS=$(wc -l < "${HOSTFILE}")
    NGPU_PER_HOST=$(nvidia-smi -L | wc -l)
    NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
    export MASTER_ADDR="127.0.0.1"
    export MASTER_PORT="5432"
    EXEC_STR=(
      "${MPI_COMMAND}"
      "${MPI_DEFAULTS}"
      "${MPI_ELASTIC}"
      "$(which python3)"
      "${MAIN}"
      "${ARGS}"
    )
  elif [[ "${host}" == nid* || "${host}" == login* ]]; then
    echo "Setting up from Perlmutter on ${host}"
    MACHINE="Perlmutter"
    setupPerlmutter
    setupSrun
    echo "SRUN_EXEC: ${SRUN_EXEC}"
    export MASTER_ADDR="$SLURMD_NODENAME"
    EXEC_STR=(
      "${SRUN_EXEC}"
      "$(which python3)"
      "${MAIN}"
      "${ARGS}"
    )
  else
    echo "Unexpected hostname ${host}"
  fi
  export WORLD_SIZE="${NGPUS}"
  echo "\
    Running on ${NHOSTS} hosts \
    with ${NGPU_PER_HOST} GPUs each \
    for a total of ${NGPUS} GPUs"
  EXEC="${EXEC_STR[*]}"
  OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log"
  # Show the whole command; a bare ${EXEC_STR} is only element 0.
  echo "EXEC_STR: ${EXEC_STR[*]}"
  echo "Writing logs to: ${OUTPUT_LOG}"
  mkdir -p "$(dirname "${OUTPUT_LOG}")"
  echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles"
  printJobInfo | tee -a "${OUTPUT_LOG}"
  launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 &
  PID=$!
  wait "${PID}"
}

# ---- patch file boundary (header kept verbatim, commented out) ----------
# diff --git a/ALCF/pre-AuroraGPT/llama2_vars.sh b/ALCF/pre-AuroraGPT/llama2_vars.sh
# new file mode 100755
# index 0000000000..2fc8c3898a
# --- /dev/null
# +++ b/ALCF/pre-AuroraGPT/llama2_vars.sh
# @@ -0,0 +1,435 @@
#!/bin/bash
# This example script is contributed by external user https://github.com/nrailgun
# [2023-12-20]: Modified by [@saforem2](https://github.com/saforem2)
# set -ex
#
function WhereAmI() {
  python3 -c 'import os; print(os.getcwd())'
}

# join_by SEP A B C -> "ASEPBSEPC"
function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; }

function sourceFile() {
  FILE="$1"
  echo "source-ing ${FILE}"
  if [[ -f "${FILE}" ]]; then
    # shellcheck source="${FILE}"
    source "${FILE}"
  else
    echo "ERROR: UNABLE TO SOURCE ${FILE}"
  fi
}


USER=$(whoami)
HERE=$(WhereAmI)
ALCF_DIR="${HERE}/ALCF"
PARENT=$(dirname "${ALCF_DIR}")
echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"
echo "ALCF_DIR: ${ALCF_DIR}"
echo "PARENT: ${PARENT}"
echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"


HOSTNAME=$(hostname)
sourceFile "${ALCF_DIR}/setup.sh"
sourceFile "${ALCF_DIR}/model.sh"

WORLD_SIZE="${NGPUS}"
PARALLEL_SIZE="${WORLD_SIZE}"
echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS"

# MODEL_LLAMA_KEY="LLAMA-24L"
# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
# NUM_LAYERS=24 # e.g. llama-13b: 40
# NUM_HEADS=16 # e.g. llama-13b: 40
# SEQ_LENGTH=2048
# NUM_KV_HEADS=4 # llama2 70B uses GQA
# FFN_HIDDEN_SIZE=5504
# NUM_HEADS=16 # e.g.
# llama-13b: 40
######################################
# Change the below configurations here
# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx

# WhereAmI is not defined in this part of the file; it must already be in scope.
USER=$(whoami)
HERE=$(WhereAmI)
ALCF_DIR="${HERE}/ALCF"
PARENT=$(dirname "${ALCF_DIR}")

MEGATRON_DIR="${HERE}"

# DATA_DIR="${HOME}/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DeepSpeed/dataset/"
BASE_PATH="${MEGATRON_DIR}"
DS_CONFIG=${BASE_PATH}/deepspeed.json
# NOTE(review): DATA_DIR is only assigned further down, so unless the caller
# exported it this expands to "/BookCorpusDataset_text_document" — confirm.
DATASET_1="${DATA_DIR}/BookCorpusDataset_text_document"
# DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence"
DATASET="1 ${DATASET_1}"
# CHECKPOINT_PATH=./tmp
TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model

# Pick the dataset location from the machine we are on (hostname read once).
_llama_host=$(hostname)
if [[ "${_llama_host}" == nid* || "${_llama_host}" == login* ]]; then
  DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed"
  DATA_TYPE="BookCorpusDataset_text_document"
elif [[ "${_llama_host}" == theta* || "${_llama_host}" == x3* ]]; then
  DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k"
  DATA_TYPE="genslm_subsample_200k_sequence_document"
else
  echo "Unable to determine DATA_PARENT for ${_llama_host}."
  echo "Exiting!"
  exit 1
fi

DATA_DIR="${DATA_PARENT}/dataset"
DATA_PATH="${DATA_DIR}/${DATA_TYPE}"
VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json"
MERGE_FILE="${DATA_DIR}/gpt2-merges.txt"

echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"
echo "ALCF_DIR: ${ALCF_DIR}"
echo "MEGATRON_DIR: ${MEGATRON_DIR}"
echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+"

DATA_LOAD_ARGS=(
  "--data-path $DATA_PATH"
  "--vocab-file $VOCAB_FILE"
  "--merge-file $MERGE_FILE"
)

# TP=2
# PP=2
# ZERO_STAGE=0

GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
MASTER_ADDR=localhost
MASTER_PORT=6000
# Count nodes from the scheduler's nodefile, or assume one node. The previous
# "${PBS_NODEFILE:-${COBALT_NODEFILE:-1}}" tried to read a file named "1".
_llama_nodefile="${PBS_NODEFILE:-${COBALT_NODEFILE:-}}"
if [[ -n "${_llama_nodefile}" && -r "${_llama_nodefile}" ]]; then
  NNODES=$(wc -l < "${_llama_nodefile}")
else
  NNODES=1
fi
unset _llama_nodefile
NODE_RANK=0

export SEQ_LENGTH=${SEQ_LENGTH:-2048}
export NUM_KV_HEADS=4 # llama2 70B uses GQA
export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-LLAMA_7B}"
export MODEL_TYPE=${MODEL_TYPE:-llama}
echo "==========================+"
echo "Using ${MODEL_SIZE_KEY}"
echo "==========================+"


# Parallelism / batching knobs: every one except DDP_IMPL can be overridden
# from the environment.
export DDP_IMPL="local"
export GAS=${GAS:-1}
export MPSIZE=${MPSIZE:-1}
export SPSIZE=${SPSIZE:-1}
export PPSIZE=${PPSIZE:-1}
export SP_TYPE=${SP_TYPE:-"ds"}
export MICRO_BATCH=${MICRO_BATCH:-1}

# export HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
# export FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
# export NUM_LAYERS=24 # e.g. llama-13b: 40
# export NUM_HEADS=16 # e.g. llama-13b: 40
# export SEQ_LENGTH=${SEQ_LENGTH:-2048}
# export NUM_KV_HEADS=4 # llama2 70B uses GQA

NUM_KV_HEADS=4 # llama2 70B uses GQA
FFN_HIDDEN_SIZE=5504

# GLOBAL_BATCH=32 # e.g. llama: 4M tokens
TRAIN_STEPS=250000 # e.g.
# llama: 1T tokens / 4M tokens_per_batch = 250000 steps
LR=3e-4
MIN_LR=3e-5
LR_WARMUP_STEPS=2000
WEIGHT_DECAY=0.1
GRAD_CLIP=1

## Activation checkpointing saves GPU memory, but reduces training speed
# activation_checkpoint="true"
activation_checkpoint="false"

# Below configuration required for llama model as per llama paper
# --no-query-key-layer-scaling \
# --attention-dropout 0 \
# --hidden-dropout 0 \
# --use-rotary-position-embeddings \
# --untie-embeddings-and-output-weights \
# --swiglu \
# --normalization rmsnorm \
# --disable-bias-linear \
######################################

# Deal with Sequence Parallel implementation ---------------------------------------
# ----------------------------------------------------------------------------------

#######################################
# Derive MPSIZE / SPSIZE / ZERO_STAGE / USE_SEQUENCE_PARALLEL from SP_TYPE.
# Globals:
#   SP_TYPE, WORLD_SIZE, PPSIZE (read)
#   MPSIZE, SPSIZE, ZERO_STAGE, USE_SEQUENCE_PARALLEL (read, written, exported)
# Outputs: progress messages on stdout
#######################################
_llama_configure_seq_parallel() {
  if [[ "${SP_TYPE}" == "ds" ]]; then
    # NOTE: --------------------------------------------------------------------
    # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism
    # Update [2023-08-22]: Chengming mentioned that this is an internal issue
    # and will NOT work currently
    # --------------------------------------------------------------------------
    if [[ "${MPSIZE}" == "${WORLD_SIZE}" ]]; then
      # hacky workaround to try and use SP_TYPE="ds": keep MPSIZE == WORLD_SIZE
      echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1"
      SPSIZE=1
    else
      echo "Didn't catch MPSIZE from env. Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1"
      MPSIZE=1
      SPSIZE="${WORLD_SIZE}"
    fi
    if [[ -z "${ZERO_STAGE}" ]]; then
      echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}"
      ZERO_STAGE=3
    else
      echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}"
    fi
    export SPSIZE="${SPSIZE:-$WORLD_SIZE}"
    export MPSIZE="${MPSIZE:-1}"
    export USE_SEQUENCE_PARALLEL=0
    export ZERO_STAGE="${ZERO_STAGE}"
  elif [[ "${SP_TYPE}" == "megatron" ]]; then
    # NOTE: --------------------------------------------------------------------
    # SP_TYPE="megatron" will use Megatron's Seq. || implementation; ZERO_STAGE
    # defaults to 0 when it is not already set.
    # --------------------------------------------------------------------------
    # Explicit if/else instead of `test && echo || assign`, which would also
    # run the assignment if the echo itself failed.
    if [[ -n "${SPSIZE}" ]]; then
      echo "Caught SPSIZE: ${SPSIZE} from env"
    else
      SPSIZE=1
    fi
    if [[ -n "${MPSIZE}" ]]; then
      echo "Caught MPSIZE: ${MPSIZE} from env"
    else
      MPSIZE="${WORLD_SIZE}"
    fi
    if [[ -n "${ZERO_STAGE}" ]]; then
      echo "Caught ${ZERO_STAGE} from env"
    else
      ZERO_STAGE=0
    fi
    if [[ -n "${USE_SEQUENCE_PARALLEL}" ]]; then
      echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env"
    else
      USE_SEQUENCE_PARALLEL=1
    fi
    # Numeric comparison: `[[ a > b ]]` compares strings lexicographically.
    if (( ${PPSIZE:-1} > 1 )); then
      MPSIZE=$(( WORLD_SIZE / PPSIZE ))
      echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))"
      echo "MPSIZE: $MPSIZE"
    fi
    export SPSIZE="${SPSIZE}"
    export MPSIZE="${MPSIZE}"
    export ZERO_STAGE="${ZERO_STAGE}"
    export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}"
  else
    # Deliberately non-fatal: the script carries on with the current values.
    echo "Unexpected SP_TYPE: ${SP_TYPE}"
    # exit 1
  fi
}

#######################################
# Print the resolved parallelism settings.
# Globals: SP_TYPE, SPSIZE, PPSIZE, MPSIZE, ZERO_STAGE, WORLD_SIZE,
#          USE_SEQUENCE_PARALLEL (read)
# Outputs: summary on stdout
#######################################
_llama_print_parallel_summary() {
  echo "####################################################"
  echo "USING: ${SP_TYPE}"
  echo "SPSIZE: ${SPSIZE}"
  echo "PPSIZE: ${PPSIZE}"  # previously printed SPSIZE under the PPSIZE label
  echo "MPSIZE: ${MPSIZE}"
  echo "ZERO_STAGE: ${ZERO_STAGE}"
  echo "WORLD_SIZE: ${WORLD_SIZE}"
  echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}"
  echo "####################################################"
}

_llama_configure_seq_parallel
# ------------------------------------------------------------------------
#
_llama_print_parallel_summary

echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" +echo "########################################################" + +GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GAS )) + +GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) + +echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP * DP) = ${NGPUS} * ${MICRO_BATCH} * ${GAS} = ${GLOBAL_BATCH} / (${MPSIZE} * ${PPSIZE} * ${PPSIZE})" +# echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GAS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" + +if [[ "${GLOBAL_BATCH}" == 0 ]]; then + GLOBAL_BATCH=1 +fi +# [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" + +DPSIZE=$(( $WORLD_SIZE / $PPSIZE / $MPSIZE )) + +export GLOBAL_BATCH="$(( GLOBAL_BATCH * DPSIZE ))" +# export GLOBAL_BATCH="$GLOBAL_BATCH" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + +echo "--------------------------------" +echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "USING DPSIZE: ${DPSIZE}" +echo "--------------------------------" + +# REMAINDER=$(( GLOBAL_BATCH % (MICRO_BATCH * DPSIZE))) +# if [[ "${GLOBAL_BATCH} "]] + + + + +cat < $DS_CONFIG +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "wandb": { + "enabled": true, + "project": "GenSLM-Megatron-DS" + } +} +EOT + +ds_args="" +ds_args=" --deepspeed ${ds_args}" +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [ "${activation_checkpoint}" = "true" ]; then + ds_args="--deepspeed-activation-checkpointing ${ds_args}" + + ## old argument for recomputing the transformer layer + # ds_args="--checkpoint-activations ${ds_args}" + + ## new argument for recomputing the 
transformer layer + ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" + ## new argument for recomputing only the attention layer + # ds_args="--recompute-granularity selective ${ds_args}" +fi + + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +# torchrun $DISTRIBUTED_ARGS \ +# pretrain_gpt.py \ + + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ FILE I/O SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}" +RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}" +RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" +RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" +RUN_STR="${MODEL_SIZE_KEY}_${RUN_STR}" + +# if [[ "${USE_FLASH_ATTN}" == 0 ]]; then +# echo "Not using Flash Attention!!" +# else +# +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + # Flash Attention 1 + [ "${USE_FLASH_ATTN}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN_V1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + # Flash Attention 2 + [ "${USE_FLASH_ATTN2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" + [ "${USE_FLASH_ATTN_V2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + # Triton + Flash Attn + # Triton + Flash Attn + [ "${USE_FLASH_ATTN_TRITON}" ] && RUN_STR="flashAttn_triton_${RUN_STR}" +else + echo "Not using Flash Attention!" 
+fi + +if [[ $DDP_IMPL == 'FSDP' ]]; then + RUN_STR="FSDP_${RUN_STR}" +fi +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]]; then + RUN_STR="actCkpt_${RUN_STR}" +fi +if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then + RUN_STR="SP_${RUN_STR}" +fi + +RUN_STR="${MODEL_SIZE}_${RUN_STR}" + +OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" +CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" +TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" + +DATE=$(date) +export DATE="${DATE}" +export RUN_STR="${RUN_STR}" +export MODEL_SIZE="${MODEL_SIZE:-${MODEL_SIZE_KEY}}" +export MODEL_SIZE="$MODEL_SIZE" +export TENSORBOARD_DIR=$TENSORBOARD_DIR +export OUTPUT_DIR=$OUTPUT_DIR +mkdir -p "$OUTPUT_DIR/tensorboard/wandb" +mkdir -p "$CHECKPOINT_DIR" +mkdir -p "$TENSORBOARD_DIR" +mkdir -p "$OUTPUT_DIR" +echo "OUTPUT TO: ${OUTPUT_DIR}" + +gpt_args=( + "--tensor-model-parallel-size $MPSIZE" + "--pipeline-model-parallel-size $PPSIZE" + "--num-layers $NLAYERS" + "--hidden-size $HIDDEN" + "--ffn-hidden-size $FFN_HIDDEN_SIZE" + "--num-attention-heads $ATEN_HEADS" + "--micro-batch-size $MICRO_BATCH" + "--global-batch-size $GLOBAL_BATCH" + "--seq-length $SEQ_LENGTH" + "--max-position-embeddings $SEQ_LENGTH" + "--train-iters $TRAIN_STEPS" + "--save $CHECKPOINT_DIR" + "--load $CHECKPOINT_DIR" + "--data-path $DATASET" + "--data-impl mmap" + "--tokenizer-type GPTSentencePieceTokenizer" + "--tokenizer-model $TOKENIZER_PATH" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr $LR" + "--lr-decay-style cosine" + "--min-lr $MIN_LR" + "--weight-decay $WEIGHT_DECAY" + "--clip-grad $GRAD_CLIP" + "--lr-warmup-iters $LR_WARMUP_STEPS" + "--optimizer adam" + "--adam-beta1 0.9" + "--adam-beta2 0.95" + "--log-interval 1" + "--save-interval 10000" + "--eval-interval 1000" + "--eval-iters 10" + "--bf16" + "--no-query-key-layer-scaling" + "--attention-dropout 0" + "--hidden-dropout 0" + "--use-rotary-position-embeddings" + "--untie-embeddings-and-output-weights" + "--swiglu" + "--normalization rmsnorm" + 
"--disable-bias-linear" + "--num-key-value-heads $NUM_KV_HEADS" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + "--tensorboard-log-interval 1" + "--data-path $DATA_PATH" + "--vocab-file $VOCAB_FILE" + "--merge-file $MERGE_FILE" +) + +# DATA_LOAD_ARGS=( +# "--data-path $DATA_PATH" +# "--vocab-file $VOCAB_FILE" +# "--merge-file $MERGE_FILE" +# ) + +export gpt_args=( + "${gpt_args[*]}" + "${ds_args[*]}" + # "${DATA_LOAD_ARGS[*]}" +) +ARGS="$(join_by ' ' ${gpt_args[*]})" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "ARGS: ${ARGS}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +# gpt_args+="${ds_args}" +# gpt_args+="${DATA_LOAD_ARGS}" diff --git a/ALCF/pre-AuroraGPT/model.sh b/ALCF/pre-AuroraGPT/model.sh new file mode 100755 index 0000000000..ef01f5541d --- /dev/null +++ b/ALCF/pre-AuroraGPT/model.sh @@ -0,0 +1,383 @@ +#!/bin/bash --login +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ GPT MODEL SETTINGS ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model / Architecture settings ┃ +# ┃ ---------------------------------------------------- ┃ +# ┃ GPT-3 models use 2K sequence length/context window ┃ +# ┃ The "GPT-3 XXX" below are configs from GPT-3 paper ┃ +# ┃ https://arxiv.org/abs/2005.14165, choose based on ┃ +# ┃ your desired model size or build your own configs ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +declare -A A_NLAYERS +declare -A A_HIDDEN +declare -A A_ATEN_HEADS + +# | =================== Llama 2 Architecture ==================== | +# | Hidden Size | Inter. 
Size | Atten Heads | Layers | Model Size | +# |:-----------:|:-----------:|:-----------:|:------:|:----------:| +# | 4096 | 11008 | 32 | 32 | 7b | +# | 5120 | 13824 | 40 | 40 | 13b | +# | 8192 | 28672 | 64 | 80 | 70b | + +MODEL_LLAMA_7B_KEY="LLAMA_7B" +A_NLAYERS[$MODEL_LLAMA_7B_KEY]=32 +A_ATEN_HEADS[$MODEL_LLAMA_7B_KEY]=32 +A_HIDDEN[$MODEL_LLAMA_7B_KEY]=4096 + +MODEL_LLAMA_13B_KEY="LLAMA_13B" +A_NLAYERS[$MODEL_LLAMA_13B_KEY]=40 +A_ATEN_HEADS[$MODEL_LLAMA_13B_KEY]=40 +A_HIDDEN[$MODEL_LLAMA_13B_KEY]=5120 + +MODEL_LLAMA_70B_KEY="LLAMA_70B" +A_NLAYERS[$MODEL_LLAMA_70B_KEY]=80 +A_ATEN_HEADS[$MODEL_LLAMA_70B_KEY]=64 +A_HIDDEN[$MODEL_LLAMA_70B_KEY]=8192 + +# HIDDEN_SIZE=4096 +# NUM_LAYERS=24 # e.g. llama-13b: 40 + + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Llama2 ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE_KEY="LLAMA_24L" +# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# NUM_LAYERS=24 # e.g. llama-13b: 40 +# NUM_HEADS=16 # e.g. llama-13b: 40 +# SEQ_LENGTH=2048 +# NUM_KV_HEADS=4 # llama2 70B uses GQA +# + + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3 Small: 125M ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +MODEL_125M_KEY="GPT125M" +A_NLAYERS[$MODEL_125M_KEY]=12 +A_HIDDEN[$MODEL_125M_KEY]=768 +A_ATEN_HEADS[$MODEL_125M_KEY]=16 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ BERT: 1.2B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="BERT1.2B" +# NLAYERS=24 +# HIDDEN=2048 +# ATEN_HEADS=128 + +BERT_1_2B_KEY="BERT1.2B" +A_NLAYERS[$BERT_1_2B_KEY]=24 +A_HIDDEN[$BERT_1_2B_KEY]=2048 +A_ATEN_HEADS[$BERT_1_2B_KEY]=128 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1536 +# ATEN_HEADS=24 + +MODEL_1_5B_KEY="GPT1_5B" +A_NLAYERS[$MODEL_1_5B_KEY]=48 +A_HIDDEN[$MODEL_1_5B_KEY]=1536 +A_ATEN_HEADS[$MODEL_1_5B_KEY]=24 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1600 +# ATEN_HEADS=25 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 2.7B Params ┃ +# 
┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="2.7B" +# NLAYERS=32 +# HIDDEN=2560 +# ATEN_HEADS=32 + +MODEL_2_7B_KEY="GPT2_7B" +A_NLAYERS[$MODEL_2_7B_KEY]=32 +A_HIDDEN[$MODEL_2_7B_KEY]=2560 +A_ATEN_HEADS[$MODEL_2_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 6.7B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="6.7B" +# NLAYERS=32 +# HIDDEN=4096 +# ATEN_HEADS=32 + +MODEL_6_7B_KEY="GPT6_7B" +A_NLAYERS[$MODEL_6_7B_KEY]=32 +A_HIDDEN[$MODEL_6_7B_KEY]=4096 +A_ATEN_HEADS[$MODEL_6_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 13B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="13B" +# NLAYERS=40 +# HIDDEN=5120 +# ATEN_HEADS=40 + +MODEL_13B_KEY="GPT13B" +A_NLAYERS[$MODEL_13B_KEY]=40 +A_HIDDEN[$MODEL_13B_KEY]=5120 +A_ATEN_HEADS[$MODEL_13B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 18.4B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="18.4B" +# NLAYERS=40 +# HIDDEN=6144 +# ATEN_HEADS=48 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 20B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="20B" +# NLAYERS=44 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 25B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="25B" +# NLAYERS=64 +# ------------ +# HIDDEN=5760 # DEFAULT (no flash attn) +# ATEN_HEADS=64 +# ------------ +# HIDDEN=5888 # headdim = 5888 / 46 = 128 +# ATEN_HEADS=46 +# ----------------- +# -- FLASH ATTN -- +# headdim = 5760 / 80 = 72 +# HIDDEN=5760 +# ATEN_HEADS=80 +# ------------ + +MODEL_25B_KEY="GPT25B" +A_NLAYERS[$MODEL_25B_KEY]=64 +A_HIDDEN[$MODEL_25B_KEY]=6144 +A_ATEN_HEADS[$MODEL_25B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 30B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="30B" +# NLAYERS=64 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# head size must be divisible by 8 (requirements of flash attention) +# head num must be divisible by sequence/tensor parallel size +MODEL_30B_KEY="GPT30B" +A_NLAYERS[$MODEL_30B_KEY]=64 +A_HIDDEN[$MODEL_30B_KEY]=6144 +A_ATEN_HEADS[$MODEL_30B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ 
GPT-3: 33B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="33B" +# NLAYERS=80 +# HIDDEN=5760 +# ATEN_HEADS=80 + +# MODEL_33B_KEY="GPT33B" +# A_NLAYERS[$MODEL_33B_KEY]=80 +# A_HIDDEN[$MODEL_33B_KEY]=5760 +# A_ATEN_HEADS[$MODEL_33B_KEY]=80 + +MODEL_33B_KEY="GPT33B" +A_NLAYERS[$MODEL_33B_KEY]=80 +A_HIDDEN[$MODEL_33B_KEY]=6144 +A_ATEN_HEADS[$MODEL_33B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 145B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="145B" +# NLAYERS=80 +# HIDDEN=12288 +# ATEN_HEADS=96 +# +GPT145B_HIDDEN=12288 +GPT145B_ATEN_HEADS=96 + +MODEL_145B_2L_KEY="GPT145B_2L" +A_NLAYERS[$MODEL_145B_2L_KEY]=2 +A_HIDDEN[$MODEL_145B_2L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_2L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_4L_KEY="GPT145B_4L" +A_NLAYERS[$MODEL_145B_4L_KEY]=4 +A_HIDDEN[$MODEL_145B_4L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_4L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_5L_KEY="GPT145B_5L" +A_NLAYERS[$MODEL_145B_5L_KEY]=5 +A_HIDDEN[$MODEL_145B_5L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_5L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_6L_KEY="GPT145B_6L" +A_NLAYERS[$MODEL_145B_6L_KEY]=6 +A_HIDDEN[$MODEL_145B_6L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_6L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_8L_KEY="GPT145B_8L" +A_NLAYERS[$MODEL_145B_8L_KEY]=8 +A_HIDDEN[$MODEL_145B_8L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_8L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_10L_KEY="GPT145B_10L" +A_NLAYERS[$MODEL_145B_10L_KEY]=10 +A_HIDDEN[$MODEL_145B_10L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_10L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_12L_KEY="GPT145B_12L" +A_NLAYERS[$MODEL_145B_12L_KEY]=12 +A_HIDDEN[$MODEL_145B_12L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_12L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_16L_KEY="GPT145B_16L" +A_NLAYERS[$MODEL_145B_16L_KEY]=16 +A_HIDDEN[$MODEL_145B_16L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_16L_KEY]="${GPT145B_ATEN_HEADS}" + 
+MODEL_145B_24L_KEY="GPT145B_24L" +A_NLAYERS[$MODEL_145B_24L_KEY]=24 +A_HIDDEN[$MODEL_145B_24L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_24L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_32L_KEY="GPT145B_32L" +A_NLAYERS[$MODEL_145B_32L_KEY]=32 +A_HIDDEN[$MODEL_145B_32L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_32L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_48L_KEY="GPT145B_48L" +A_NLAYERS[$MODEL_145B_48L_KEY]=48 +A_HIDDEN[$MODEL_145B_48L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_48L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_64L_KEY="GPT145B_64L" +A_NLAYERS[$MODEL_145B_64L_KEY]=64 +A_HIDDEN[$MODEL_145B_64L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_64L_KEY]="${GPT145B_ATEN_HEADS}" + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 175B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="175B" +# NLAYERS=96 +# HIDDEN=12288 +# ATEN_HEADS=96 +# if [ -z "$NLAYERS" ]; then +# A_NLAYERS[$MODEL_145B_KEY]="${NLAYERS}" +# echo "Caught NLAYERS=${NLAYERS} from env, using this value!" 
+# else +# A_NLAYERS[$MODEL_145B_KEY]=80 +# echo "Using default NLAYERS=80" +# fi +MODEL_145B_KEY="GPT145B" +A_NLAYERS[$MODEL_145B_KEY]=80 +A_HIDDEN[$MODEL_145B_KEY]=12288 +A_ATEN_HEADS[$MODEL_145B_KEY]=96 + + + +MODEL_1T_HIDDEN=25600 +MODEL_1T_ATEN_HEADS=160 +MODEL_1T_1L_KEY="GPT1T_1L" +A_NLAYERS[$MODEL_1T_1L_KEY]=1 +A_HIDDEN[$MODEL_1T_1L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_1L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_2L_KEY="GPT1T_2L" +A_NLAYERS[$MODEL_1T_2L_KEY]=2 +A_HIDDEN[$MODEL_1T_2L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_2L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_4L_KEY="GPT1T_4L" +A_NLAYERS[$MODEL_1T_4L_KEY]=4 +A_HIDDEN[$MODEL_1T_4L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_4L_KEY]="${MODEL_1T_ATEN_HEADS}" + + +MODEL_1T_8L_KEY="GPT1T_8L" +A_NLAYERS[$MODEL_1T_8L_KEY]=8 +A_HIDDEN[$MODEL_1T_8L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_8L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_16L_KEY="GPT1T_16L" +A_NLAYERS[$MODEL_1T_16L_KEY]=16 +A_HIDDEN[$MODEL_1T_16L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_16L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_24L_KEY="GPT1T_24L" +A_NLAYERS[$MODEL_1T_24L_KEY]=24 +A_HIDDEN[$MODEL_1T_24L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_24L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_30L_KEY="GPT1T_30L" +A_NLAYERS[$MODEL_1T_30L_KEY]=30 +A_HIDDEN[$MODEL_1T_30L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_30L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_32L_KEY="GPT1T_32L" +A_NLAYERS[$MODEL_1T_32L_KEY]=32 +A_HIDDEN[$MODEL_1T_32L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_32L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_60L_KEY="GPT1T_60L" +A_NLAYERS[$MODEL_1T_60L_KEY]=60 +A_HIDDEN[$MODEL_1T_60L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_60L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_64L_KEY="GPT1T_64L" +A_NLAYERS[$MODEL_1T_64L_KEY]=64 +A_HIDDEN[$MODEL_1T_64L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_64L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_120L_KEY="GPT1T_120L" 
#######################################
# Print the absolute, symlink-resolved path of the current directory.
# Outputs: the directory path on stdout.
#######################################
function WhereAmI() {
    # `pwd -P` reports the same physical path as Python's os.getcwd(), but
    # does not need a python3 interpreter, which may not be on PATH yet when
    # this file is sourced (loading conda happens further down).
    pwd -P
}

HERE=$(WhereAmI)
# ALCF_DIR=$(find "${HERE}" -name "ALCF")
ALCF_DIR="${HERE}/ALCF"
PARENT=$(dirname "${ALCF_DIR}")

#######################################
# Join items with a delimiter.
# Arguments: $1 - delimiter, $2... - items to join
# Outputs:   the joined string on stdout (no trailing newline); nothing when
#            fewer than two arguments are given.
#######################################
function join_by {
    local d=${1-} f=${2-}
    if shift 2; then
        # "${@/#/$d}" prefixes every remaining item with the delimiter.
        printf %s "$f" "${@/#/$d}"
    fi
}

#######################################
# Activate the Python virtual environment found in a directory.
# Globals:   VENV_DIR (written; intentionally left global as before)
# Arguments: $1 - path to the virtual environment directory
# Outputs:   a status line on stdout; an error on stderr when the directory
#            exists but holds no activate script.
# Returns:   0 when activated or skipped, 1 when bin/activate is missing.
#######################################
function setupVenv() {
    VENV_DIR="$1"
    if [[ ! -d "${VENV_DIR}" ]]; then
        echo "Skipping setupVenv() on $(hostname)"
        return 0
    fi
    echo "Found venv at: ${VENV_DIR}"
    if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then
        echo "ERROR: setupVenv(): no activate script in ${VENV_DIR}/bin" >&2
        return 1
    fi
    # shellcheck source=/dev/null
    source "${VENV_DIR}/bin/activate"
}
#######################################
# Configure the mpirun launch settings for ThetaGPU.
# Does nothing (beyond a message) unless the hostname starts with "theta".
# Globals read:    HOSTFILE, COBALT_NODEFILE
# Globals written: HOSTFILE (exported), NHOSTS, NGPU_PER_HOST, NGPUS,
#                  NVME_PATH, MPI_COMMAND, MPI_DEFAULTS, MPI_ELASTIC
# Returns: 0 when configured or skipped, 1 when the hostfile is unreadable.
#######################################
function thetagpuMPI() {
    local host
    host=$(hostname)
    if [[ "${host}" != theta* ]]; then
        echo "Skipping thetaGPUMPI() on ${host}"
        return 0
    fi
    export HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}"
    # Fail early with a clear message instead of letting `wc` and the
    # arithmetic below produce confusing errors on an empty/missing path.
    if [[ ! -r "${HOSTFILE}" ]]; then
        echo "ERROR: thetagpuMPI(): cannot read HOSTFILE='${HOSTFILE}'" >&2
        return 1
    fi
    # $(( ... )) also strips the padding some `wc` implementations emit.
    NHOSTS=$(( $(wc -l < "${HOSTFILE}") ))
    NGPU_PER_HOST=$(( $(nvidia-smi -L | wc -l) ))
    NGPUS=$(( NHOSTS * NGPU_PER_HOST ))
    NVME_PATH="/raid/scratch/"
    MPI_COMMAND=$(command -v mpirun)
    # export PATH="${CONDA_PREFIX}/bin:${PATH}"
    local IFS=' '
    local -a defaults=(
        --hostfile "${HOSTFILE}"
        -x CFLAGS
        -x LDFLAGS
        -x http_proxy
        -x CUDA_DEVICE_MAX_CONNECTIONS
        -x PYTHONUSERBASE
        -x https_proxy
        -x PATH
        -x LD_LIBRARY_PATH
    )
    # Kept as flat, space-separated strings because that is the form the
    # original assigned; only the amount of whitespace between flags differs.
    MPI_DEFAULTS="${defaults[*]}"
    MPI_ELASTIC="-n ${NGPUS} -npernode ${NGPU_PER_HOST}"
}

#######################################
# Configure the mpiexec launch settings for Polaris.
# Does nothing (beyond a message) unless the hostname starts with "x3".
# Globals read:    HOSTFILE, PBS_NODEFILE
# Globals written: HOSTFILE, NHOSTS, NGPU_PER_HOST, NGPUS, MPI_COMMAND,
#                  NVME_PATH (all exported), MPI_DEFAULTS, MPI_ELASTIC
# Returns: 0 when configured or skipped, 1 when the hostfile is unreadable.
#######################################
function polarisMPI() {
    local host
    host=$(hostname)
    if [[ "${host}" != x3* ]]; then
        echo "Skipping polarisMPI() on ${host}"
        return 0
    fi
    export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"
    if [[ ! -r "${HOSTFILE}" ]]; then
        echo "ERROR: polarisMPI(): cannot read HOSTFILE='${HOSTFILE}'" >&2
        return 1
    fi
    # Assign first, export afterwards, so a failing command substitution is
    # not hidden behind the exit status of `export`.
    NHOSTS=$(( $(wc -l < "${HOSTFILE}") ))
    NGPU_PER_HOST=$(( $(nvidia-smi -L | wc -l) ))
    NGPUS=$(( NHOSTS * NGPU_PER_HOST ))
    MPI_COMMAND=$(command -v mpiexec)
    NVME_PATH="/local/scratch/"
    export NHOSTS NGPU_PER_HOST NGPUS MPI_COMMAND NVME_PATH
    MPI_DEFAULTS="--envall --verbose --hostfile ${HOSTFILE}"
    MPI_ELASTIC="-n ${NGPUS} --ppn ${NGPU_PER_HOST}"
}
#######################################
# Load conda and a virtual environment on Polaris (hostname x3*), each only
# if not already active.
# Globals read:    CONDA_EXE, VIRTUAL_ENV
# Globals written: DATE_STR
# NOTE(review): DATE_STR is also handed to setupVenv, which looks like it
# expects a directory path -- confirm that this is intended.
#######################################
function condaPolaris() {
    local host
    host=$(hostname)
    if [[ "${host}" != x3* ]]; then
        echo "Skipping condaPolaris() on ${host}"
        return 0
    fi
    DATE_STR="2023-10-04"
    if [[ -z "${CONDA_EXE}" ]]; then
        loadCondaEnv "${DATE_STR}"
    fi
    if [[ -z "${VIRTUAL_ENV}" ]]; then
        setupVenv "${DATE_STR}"
    fi
}

#######################################
# Load conda and a virtual environment on ThetaGPU (hostname theta*),
# reporting any environment that is already active instead of reloading it.
# Globals read:    CONDA_EXE, VIRTUAL_ENV
# Globals written: DATE_STR
#######################################
function condaThetaGPU() {
    local host
    host=$(hostname)
    if [[ "${host}" != theta* ]]; then
        echo "Skipping condaThetaGPU() on ${host}"
        return 0
    fi
    DATE_STR="2023-01-11"
    # Explicit if/else: with `test && echo || load`, a failing echo would
    # also trigger the load branch.
    if [[ -n "${CONDA_EXE}" ]]; then
        echo "Caught CONDA_EXE: ${CONDA_EXE}"
    else
        loadCondaEnv "${DATE_STR}"
    fi
    if [[ -n "${VIRTUAL_ENV}" ]]; then
        echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}"
    else
        setupVenv "${DATE_STR}"
    fi
}

#######################################
# Full ThetaGPU setup: tag the machine, configure MPI, then Python.
# LAB and MACHINE are exported regardless of the host, as before.
# Globals read:    CONDA_EXE, VIRTUAL_ENV
# Globals written: LAB, MACHINE (exported), DATE_STR
#######################################
function setupThetaGPU() {
    export LAB="ALCF"
    export MACHINE="ThetaGPU"
    local host
    host=$(hostname)
    if [[ "${host}" != theta* ]]; then
        echo "Skipping setupThetaGPU() on ${host}"
        return 0
    fi
    setupMPI
    DATE_STR="2023-01-11"
    if [[ -n "${CONDA_EXE}" ]]; then
        echo "Caught CONDA_EXE: ${CONDA_EXE}"
    else
        loadCondaEnv "${DATE_STR}"
    fi
    if [[ -n "${VIRTUAL_ENV}" ]]; then
        echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}"
    else
        setupVenv "${DATE_STR}"
    fi
}
# Dispatch the ALCF setup by machine: theta* hosts run setupMPI followed by
# setupThetaGPU, x3* hosts run setupMPI followed by setupPolaris, and every
# other host only prints a "Skipping" message.
function setupALCF() {
    local node
    node=$(hostname)
    case "${node}" in
        theta*)
            setupMPI
            echo "Setting up ThetaGPU from ${node}"
            setupThetaGPU
            ;;
        x3*)
            setupMPI
            echo "Setting up Polaris from ${node}"
            setupPolaris
            ;;
        *)
            echo "Skipping setupALCF() on ${node}"
            ;;
    esac
}



# On login*/nid* hosts, derive the srun geometry from the SLURM environment:
# NHOSTS falls back to 1, NGPU_PER_HOST falls back to the number of lines
# printed by `nvidia-smi -L`. Exports NHOSTS, NGPU_PER_HOST, NGPUS and the
# assembled SRUN_EXEC command string.
function setupSrun() {
    local node
    node=$(hostname)
    if [[ "${node}" != login* && "${node}" != nid* ]]; then
        echo "Skipping setupSrun() on ${node}"
        return
    fi
    export NHOSTS="${SLURM_NNODES:-1}"
    export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}"
    export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))"
    export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u"
}
# Pick the site-specific setup from the hostname prefix:
#   theta* / x3*   -> LAB=ALCF,  then setupALCF
#   nid* / login*  -> LAB=NERSC, then setupSrun and setupPerlmutter
# Any other host only gets an "Unexpected hostname" message.
# HOSTNAME is (re)assigned from `hostname`, as in the original.
function setupMachine() {
    HOSTNAME="$(hostname)"
    case "${HOSTNAME}" in
        theta* | x3*)
            export LAB="ALCF"
            setupALCF
            ;;
        nid* | login*)
            export LAB="NERSC"
            setupSrun
            setupPerlmutter
            ;;
        *)
            echo "Unexpected hostname: ${HOSTNAME}"
            ;;
    esac
}
NGPU_PER_HOST="${NGPU_PER_HOST}" + export CUDA_DEVICE_MAX_CONNECTIONS=1 + echo "########################################" + echo "NHOSTS: ${NHOSTS}" + echo "NGPU_PER_HOST: ${NGPU_PER_HOST}" + echo "NGPUS: (${NHOSTS} * ${NGPU_PER_HOST}) = ${NGPUS}" + echo "########################################" +} + +setup diff --git a/ALCF/pre-AuroraGPT/submit-pbs.sh b/ALCF/pre-AuroraGPT/submit-pbs.sh new file mode 100755 index 0000000000..712a932656 --- /dev/null +++ b/ALCF/pre-AuroraGPT/submit-pbs.sh @@ -0,0 +1,263 @@ +#!/bin/bash --login +# + +cd "${PBS_O_WORKDIR}" || exit + +# cd "${PBS_O_WORKDIR}" +# +# echo "PBS_O_WORKDIR: ${PBS_O_WORKDIR}" +# +# echo "__________________________________________________________________________________" +# cd ~/datascience/foremans/locations/polaris/projects/saforem2/Megatron-DS-Benchmarking/ +# echo "pwd: $(pwd)" +# echo "__________________________________________________________________________________" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +TSTAMP=$(tstamp) +echo "┌──────────────────────────────────────────────────────────────────┐" +#####"│ Job Started at 2023-08-04-121535 on polaris-login-04 by foremans │" +echo "│ Job Started at 
#######################################
# Read a `KEY=value` assignment from a file and export KEY with that value.
# Only lines that start with `KEY=` are considered; every double quote is
# stripped from the value. When several lines match, the last one wins.
# Globals written: FILE, KEY, VAL (left global, as before), and the
#                  variable named by KEY (exported)
# Arguments: $1 - file to read, $2 - variable name to look up
# Outputs:   progress messages on stdout, problems on stderr
# Returns:   0 when exported or when the file does not exist,
#            1 for an invalid name or when KEY is not assigned in the file.
#######################################
getValFromFile() {
    FILE=$1
    KEY=$2
    VAL=""
    echo "getting ${KEY} from ${FILE}"
    # KEY ends up as an `export` target, so insist on a plain identifier.
    if [[ ! "${KEY}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        echo "ERROR: getValFromFile(): invalid variable name '${KEY}'" >&2
        return 1
    fi
    if [[ ! -f "${FILE}" ]]; then
        return 0
    fi
    local line found=0
    # Literal prefix match: no regex/sed interpolation of KEY, and text that
    # merely contains "KEY=" inside the value is left untouched.
    while IFS= read -r line || [[ -n "${line}" ]]; do
        if [[ "${line}" == "${KEY}="* ]]; then
            VAL="${line#"${KEY}="}"
            VAL="${VAL//\"/}"
            found=1
        fi
    done < "${FILE}"
    if (( ! found )); then
        # Do not clobber an existing value with an empty string.
        echo "WARNING: getValFromFile(): ${KEY} not found in ${FILE}" >&2
        return 1
    fi
    echo "setting ${KEY}: ${VAL}"
    export "${KEY}=${VAL}"
}

#######################################
# Source a file into the current shell if it exists.
# Globals written: FILE (left global, as before)
# Arguments: $1 - path of the file to source
# Outputs:   a progress line on stdout; an error on stderr when missing
# Returns:   the status of `source`, or 1 when the file does not exist.
#######################################
function sourceFile() {
    FILE="$1"
    echo "source-ing ${FILE}"
    if [[ ! -f "${FILE}" ]]; then
        echo "ERROR: UNABLE TO SOURCE ${FILE}" >&2
        return 1
    fi
    # shellcheck source=/dev/null
    source "${FILE}"
}
SEQ_LEN=$6 +# USE_FLASH_ATTN=$7 +# MICRO_BATCH=$8 +# GAS=$9 +# SP_TYPE=$10 + +# MODEL_SIZE_KEY="GPT6_7B" SEQ_LEN=2048 USE_FLASH_ATTN=0 MICRO_BATCH=1 GAS=1 SP_TYPE="deepspeed" ./ALCF/submit-pbs.sh debug-scaling 4 00:30:00 datascience + +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export MICRO_BATCH="${MICRO_BATCH}" +# export MODEL_SIZE="${MODEL_SIZE}" +# # export GAS="${GRADIENT_ACCUMULATION_STEPS}" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# +# export DDP_IMPL="local" # FSDP | local | torch +# # export USE_FLASH_ATTN=${USE_FLASH_ATTN:-0} # 1 | 0 +# # export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +# export SEQ_LEN=${SEQ_LEN:-1024} +# # export MPSIZE=${MPSIZE:-1} +# export PPSIZE=${PPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +# # export NHOSTS="$NHOSTS" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 +# + +if [ -z "${MODEL_SIZE_KEY}" ]; then + echo "ERROR: MODEL_SIZE_KEY not set" + exit 1 +fi + +if [ -z "${SEQ_LEN}" ]; then + echo "ERROR: SEQ_LEN not set" + echo "Using default SEQ_LEN=2048" + echo "Set SEQ_LEN=XXXX to change" + SEQ_LEN=2048 +fi + +if [ -z "${USE_FLASH_ATTN}" ]; then + echo "ERROR: USE_FLASH_ATTN not set" + echo "Not using flash attn! 
Set USE_FLASH_ATTN=1 to use" + USE_FLASH_ATTN=0 +fi + +if [ -z "${MICRO_BATCH}" ]; then + echo "ERROR: MICRO_BATCH not set" + echo "Using MICRO_BATCH=1" + MICRO_BATCH=1 +fi + +if [ -z "${GAS}" ]; then + echo "ERROR: GAS not set" + echo "Using GAS=1" + GAS=1 +fi + +if [ -z "${SP_TYPE}" ]; then + echo "ERROR: SP_TYPE not set" + echo "Using SP_TYPE=megatron" + SP_TYPE="megatron" +fi + +export GAS="${GAS}" +export SEQ_LEN="${SEQ_LEN}" +export SP_TYPE="${SP_TYPE}" +export MICRO_BATCH="${MICRO_BATCH}" +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY}" +export USE_FLASH_ATTN="${USE_FLASH_ATTN}" + +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" +echo "| MODEL_SIZE_KEY: ${MODEL_SIZE_KEY}" +echo "| SEQ_LEN: ${SEQ_LEN}" +echo "| USE_FLASH_ATTN: ${USE_FLASH_ATTN}" +echo "| MICRO_BATCH: ${MICRO_BATCH}" +echo "| GAS: ${GAS}" +echo "| SP_TYPE: ${SP_TYPE}" +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" + +export QUEUE="${QUEUE}" +export DURATION="${DURATION}" +export TSTAMP="${TSTAMP}" +export NUM_NODES="${NUM_NODES}" +export PROJECT="${PROJECT}" + +RUN_NAME="N${NUM_NODES}-${TSTAMP}" +# RUN_NAME="mb${MICRO_BATCH}-gas${GAS}-${RUN_NAME}" +# RUN_NAME="GPT3-${MODEL_SIZE}-${RUN_NAME}" +RUN_NAME="${MODEL_SIZE_KEY}-${SP_TYPE}-mb${MICRO_BATCH}-gas${GAS}-seqlen${SEQ_LEN}-${RUN_NAME}" +export RUN_NAME="${RUN_NAME}" + +echo "QUEUE=$QUEUE" +echo "PROJECT=$PROJECT" +echo "DURATION=$DURATION" +echo "TSTAMP=$TSTAMP" +echo "NUM_NODES=$NUM_NODES" +echo "RUN_NAME: ${RUN_NAME}" +# echo "MODEL_SIZE=$MODEL_SIZE" +# echo "GAS=$GRADIENT_ACCUMULATION_STEPS" + +# QSUB_ARGS=( +# "-q ${QUEUE}" +# "-A ${PROJECT}" +# "-N ${RUN_NAME}" +# "-l select=${NUM_NODES}" +# "-l walltime=${DURATION}" +# "-l filesystems=eagle:home:grand" +# "${DIR}/submit.sh" +# ) + +OUTPUT=$(qsub \ + -q "${QUEUE}" \ + -A "${PROJECT}" \ + -N "${RUN_NAME}" \ + -l select="${NUM_NODES}" \ + -l walltime="${DURATION}" \ + -l filesystems=eagle:home:grand \ + "${ALCF_DIR}/submit.sh") + +# OUTPUT=$(qsub "${QSUB_ARGS[@]}") + 
+PBS_JOBID=$(echo "${OUTPUT}" | cut --delimiter="." --fields=1) +export PBS_JOBID="${PBS_JOBID}" +# echo "${TSTAMP} ${PBS_JOBID} " + +PBS_JOBSTR=( + "PBS_JOBID=${PBS_JOBID}" + "QUEUE=$QUEUE" + "PROJECT=$PROJECT" + "DURATION=$DURATION" + "TSTAMP=$TSTAMP" + "NUM_NODES=$NUM_NODES" + # "MODEL_SIZE=$MODEL_SIZE" + "RUN_NAME: ${RUN_NAME}" +) + # "GAS=$GRADIENT_ACCUMULATION_STEPS" + +TODAY=$(echo "${TSTAMP}" | cut --delimiter="-" --fields=1,2,3) +OUTFILE="${PARENT}/pbslogs/${TODAY}/${PBS_JOBID}.txt" + +if [[ ! -d $(dirname "${OUTFILE}") ]]; then + mkdir -p "$(dirname "${OUTFILE}")" +fi + +echo "Writing PBS_JOBSTR to ${OUTFILE}" +echo "${PBS_JOBSTR[@]}" >> "${OUTFILE}" +# echo "${PBS_JOBSTR[@]}" | tee -a "${OUTFILE}" + +echo "┌───────────────────────────────────────────┐" +echo "│ To view job output, run: \`pbstail ${PBS_JOBID}\` │" +echo "└───────────────────────────────────────────┘" diff --git a/ALCF/pre-AuroraGPT/submit.sh b/ALCF/pre-AuroraGPT/submit.sh new file mode 100755 index 0000000000..6842eb66b6 --- /dev/null +++ b/ALCF/pre-AuroraGPT/submit.sh @@ -0,0 +1,66 @@ +#!/bin/bash --login +#PBS -V +# +cd "${PBS_O_WORKDIR}" || exit + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") +export TSTAMP="$TSTAMP" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not 
#######################################
# Source a file into the current shell if it exists.
# Globals written: FILE (left global, as before)
# Arguments: $1 - path of the file to source
# Outputs:   a progress line on stdout; an error on stderr when missing
# Returns:   the status of `source`, or 1 when the file does not exist, so
#            callers can detect a missing setup/args/model/launch script.
#######################################
function sourceFile() {
    FILE="$1"
    echo "source-ing ${FILE}"
    if [[ ! -f "${FILE}" ]]; then
        echo "ERROR: UNABLE TO SOURCE ${FILE}" >&2
        return 1
    fi
    # The path is only known at run time, so tell ShellCheck not to follow it.
    # shellcheck source=/dev/null
    source "${FILE}"
}
#######################################
# Print the absolute, symlink-resolved path of the current directory.
# Outputs: the directory path on stdout.
#######################################
function WhereAmI() {
    # `pwd -P` reports the same physical path as Python's os.getcwd() but
    # works before any Python environment has been set up, so ALCF_DIR can
    # no longer silently become "/ALCF" when python3 is missing from PATH.
    pwd -P
}
#######################################
# Source a file into the current shell if it exists.
# Globals written: FILE (left global, as before)
# Arguments: $1 - path of the file to source
# Outputs:   a progress line on stdout; an error on stderr when missing
# Returns:   the status of `source`, or 1 when the file does not exist.
#######################################
function sourceFile() {
    FILE="$1"
    echo "source-ing ${FILE}"
    if [[ ! -f "${FILE}" ]]; then
        # Report on stderr and fail, so a missing script is not mistaken
        # for a successful load.
        echo "ERROR: UNABLE TO SOURCE ${FILE}" >&2
        return 1
    fi
    # shellcheck source=/dev/null
    source "${FILE}"
}
file was located +# done +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | egrep "$USER.+mpi.+pretrain.+.py" | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" + exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +LAUNCH_FILE="${ALCF_DIR}/launch.sh" + +sourceFile "${ALCF_DIR}/setup.sh" +# sourceFile "${ALCF_DIR}/model.sh" +# sourceFile "${ALCF_DIR}/args.sh" +sourceFile "${ALCF_DIR}/llama2_vars.sh" +sourceFile "${LAUNCH_FILE}" + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +# PID=$! 
+# wait $PID +elasticDistributed "$@" 2>&1 & + diff --git a/ALCF/sunspot-env-2024-04-15-002.sh b/ALCF/sunspot-env-2024-04-15-002.sh new file mode 100644 index 0000000000..3b7155675d --- /dev/null +++ b/ALCF/sunspot-env-2024-04-15-002.sh @@ -0,0 +1,4 @@ +#!/bin/bash --login + +module use /soft/preview-modulefiles/24.086.0 +module load frameworks/2024.04.15.002.lua diff --git a/ALCF/sunspot-env.sh b/ALCF/sunspot-env.sh new file mode 100644 index 0000000000..8b02542b20 --- /dev/null +++ b/ALCF/sunspot-env.sh @@ -0,0 +1,8 @@ +#!/bin/bash --login +# +module use /home/ftartagl/graphics-compute-runtime/modulefiles +module load graphics-compute-runtime/agama-ci-devel-803.29 +module load spack-pe-gcc/0.6.1-23.275.2 +module load gcc/12.2.0 +module use /soft/preview-modulefiles/24.086.0 +module load oneapi/release/2024.04.15.001 diff --git a/ALCF/test_alcf.sh b/ALCF/test_alcf.sh new file mode 100644 index 0000000000..853addc59d --- /dev/null +++ b/ALCF/test_alcf.sh @@ -0,0 +1,166 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on {Polaris, Sunspot, Sirius} @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_alcf.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +setup_conda_sunspot() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_sirius() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" 
shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_polaris() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 + # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +function setEnv() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + # setup_conda + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" 
+ setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi + fi + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + echo "Running on Sirius !!" + setup_conda_sirius + else + echo "Running on Polaris !!" + # ---- [load conda] --------------------- + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi + fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" + module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" + else # ------------------------------------- [Unknown] ------------------- + echo "Unknown hostname $(hostname)" + exit 1 + fi + else + echo "Unable to setup python environment. Exiting" + exit 1 + fi + echo "[python] Using: $(which python3)" +} + + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + SUBMITTED_FROM=$(echo $PBS_O_HOST | tr '-' ' ' | awk '{print $1}') + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${SUBMITTED_FROM}/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + # export ZERO_STAGE=1 + # export NUM_LAYERS=10 + # export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-${SUBMITTED_FROM}-${NOW}".log +} + +main + diff --git a/ALCF/test_blend.sh b/ALCF/test_blend.sh new file mode 100755 index 0000000000..9073d2a58c --- /dev/null +++ b/ALCF/test_blend.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/home/hzheng/ALCF-Megatron-DeepSpeed +module load conda/2023-10-04 +#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 +conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/chunks-merge/data_file_list_chunk_1_of_4.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +#MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE +python3 ./test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 80797 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 
\ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blend_full.sh b/ALCF/test_blend_full.sh new file mode 100755 index 0000000000..459652a2ee --- /dev/null +++ b/ALCF/test_blend_full.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh +export TRITON_CACHE_DIR=/tmp/.cache/ + +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=80797 +ZERO_STAGE=2 +MODEL=LLAMA_7B +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 
16 ${MD}/local_rank.sh python3 ALCF/test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters ${TRAIN_ITERS} \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py new file mode 100644 index 0000000000..9681198251 --- /dev/null +++ b/ALCF/test_blendable_dataset.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +from megatron.data.gpt_dataset import build_train_valid_test_datasets +import numpy as np +from megatron.global_vars import 
set_args, set_global_variables, get_args +from megatron.arguments import parse_args +from megatron.initialize import initialize_megatron +from megatron.data.data_samplers import build_pretraining_data_loader +from mpi4py import MPI +from megatron.core import mpu +comm = MPI.COMM_WORLD +initialize_megatron(allow_no_cuda=True) +args = get_args() + +data_file_list = args.data_file_list +if comm.rank==0: + print(f"Reading data from {args.data_file_list}") +files = [] +weights = [] +flist = [] +with open(data_file_list, 'r') as fin: + for f in fin.readlines(): + w, fname = f.split() + weights.append(float(w)) + flist.append(fname) + files.append(float(w)) + files.append(fname) +splits_string="100,0,0" + +weights = np.array(weights) +weights = weights/np.sum(weights) + +num_samples = args.global_batch_size*args.train_iters +num_datasets = len(weights) +if comm.rank==0: + print(f"Number of datasets: {num_datasets}") + print(f"Global batch size: {args.global_batch_size}") + print(f"Training iterations: {args.train_iters}") +train_valid_test_num_samples = [num_samples, 0, 0] +seed=args.seed +data_impl = args.data_impl +skip_warmup = not args.mmap_warmup +seq_length = args.seq_length +splits_string = "1,0,0" + +# Build datasets +train_ds, valid_ds, test_ds = build_train_valid_test_datasets(files, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, data_cache_path=args.data_cache_path) + +dataset_idx = [train_ds.dataset_index[i] for i in range(num_samples)] +ratio_select=np.zeros(num_datasets) +#for i in range(num_datasets): +# ratio_select[i] = np.sum([i==d for d in dataset_idx])/num_samples +if comm.rank ==0: + print(f"Total number of samples: {len(train_ds)}") + print(f"Weights set: {weights[:min(8, num_datasets)]}") +#print(f"Weights across training: {ratio_select[:min(8, num_datasets)]}") + +for e in range(min(100, args.train_iters)): + ratio_select=np.zeros(num_datasets) + for i in range(num_datasets): + ratio_select[i] = 
np.sum([i==d for d in dataset_idx[e*args.global_batch_size:(e+1)*args.global_batch_size]])/args.global_batch_size + if comm.rank==0: + print(f"iter-{e}: {ratio_select[:min(8, num_datasets)]}") + + +print("First 10 samples") +for i in range(10): + if comm.rank==0: + print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") + +#### Build data loaders +rank_in_parallel_group = mpu.get_sequence_parallel_rank() +print(rank_in_parallel_group) +if rank_in_parallel_group == 0: + train_dataloader = build_pretraining_data_loader( + train_ds, args.consumed_train_samples) + valid_dataloader = build_pretraining_data_loader( + valid_ds, args.consumed_valid_samples) + test_dataloader = build_pretraining_data_loader(test_ds, 0) diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh new file mode 100644 index 0000000000..a18c87fad7 --- /dev/null +++ b/ALCF/test_polaris.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Polaris @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_polaris.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# 
does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-polaris-${NOW}".log +} + +main diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh new file mode 100755 index 0000000000..0a528a9519 --- /dev/null +++ b/ALCF/test_sirius.sh @@ -0,0 +1,88 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sirius @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sirius.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sirius/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log +} + +main diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh new file mode 100755 index 0000000000..b3b22c78b4 --- /dev/null +++ b/ALCF/test_sunspot.sh @@ -0,0 +1,87 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sunspot @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sunspot.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sunspot/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log" +} + +main diff --git a/ALCF/tokenizer.model b/ALCF/tokenizer.model new file mode 100644 index 0000000000..22bccbcb41 Binary files /dev/null and b/ALCF/tokenizer.model differ diff --git a/megatron/arguments.py b/megatron/arguments.py index dad993be04..c774759ae3 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -800,7 +800,7 @@ def _add_training_args(parser): ' ' ' ' 'For example:' - ' --rampup-batch-size 16 8 300000 \ ' + ' --rampup-batch-size 16 8 300000 \\ ' ' --global-batch-size 1024' 'will start with global batch size 16 and over ' ' (1024 - 16) / 8 = 126 intervals will increase' @@ -915,9 +915,37 @@ def _add_training_args(parser): group.add_argument('--disable-bias-linear', action='store_false', help='Disable bias in the linear layers', dest='add_bias_linear') - group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'sgd'], - help='Optimizer function') + group.add_argument( + '--optimizer', + type=str, + default='adam', + choices=[ + 'adam', + 'adamw', + 'sgd', + 'ds.fusedlamb', + 'ipex.lamb', + 'ipex.fusedlamb', + 'apex.adam', + 'apex.sgd', + 'adamwschedulefree', + 'sgdschedulefree', + 'galoreadamw', + 'adam8bit', + 'galoreadamw8bit', + 'galoreadamw8bitperlayer' + ], + help='Optimizer function' + ) + group.add_argument( + "--schedulefree-for-each", + action="store_true", + help=""" + Use a foreach-backed implementation of the schedulefree optimizers. + Should be significantly faster, + but will have a higher peak memory usage. 
+ """, + ) group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], help='Single pass vs multiple pass data loader') @@ -960,6 +988,12 @@ def _add_training_args(parser): dest='gradient_accumulation_fusion') group.add_argument('--use-dataset-only', type=bool, required=False, default=False, help='If set to True, only use the megatron dataset for external trainer ') + group.add_argument('--profile', action='store_true', help='Enable Torch Profiler') + group.add_argument('--train-iters-to-skip', action="extend", nargs="+", type=str, + help=( + "Specific train iterations to skip when training. " + "Load the data and just perform a noop." + )) return parser @@ -1213,6 +1247,9 @@ def _add_data_args(parser): 'single dataset used for all three: train, valid ' 'and test. It is exclusive to the other ' '--*-data-path args') + group.add_argument('--data-file-list', type=str, default=None, + help='The file with the list of dataset and weights') + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. 
For example the split ' @@ -1274,7 +1311,8 @@ def _add_data_args(parser): 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'HFTokenizer', - 'NullTokenizer'], + 'NullTokenizer', + 'Llama2Tokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='Sentencepiece tokenizer model.') @@ -1307,6 +1345,7 @@ def _add_data_args(parser): help='Force to use certain index file.') group.add_argument('--repeated-dataloader', action='store_true', help='Once all the data has been loaded, reuse the DataLoader.') + group.add_argument('--multiprocessing-context', type=str, default='fork') return parser @@ -1507,7 +1546,6 @@ def _add_activation_checkpoint_args(parser): def _add_distillation_args(parser): group = parser.add_argument_group('Knowledge distillation', 'Distillation Configurations') - group.add_argument('--num-layers-teacher', type=int, default=None, help='Number of the teacher transformer layers.') group.add_argument('--num-experts-teacher', type=int, nargs='+', default=[1,], @@ -1516,7 +1554,6 @@ def _add_distillation_args(parser): help='Tansformer teacher hidden size.') group.add_argument('--num-attention-heads-teacher', type=int, default=None, help='Number of teacher transformer attention heads.') - group.add_argument('--mos', action='store_true', help='Enable Mixture-of-Students via knolwedge distillation.') group.add_argument('--kd', action='store_true', @@ -1526,7 +1563,6 @@ def _add_distillation_args(parser): group.add_argument('--kd-temp', default=1.0, type=float) group.add_argument('--reset-iteration', action='store_true', help='Reset the iteration count.') - group.add_argument('--load-teacher', type=str, default=None, help='Directory containing a teacher model checkpoint.') diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 3e4d20035e..d585baf717 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -8,6 +8,12 @@ import numpy as np from 
deepspeed.accelerator import get_accelerator import torch +import ezpz as ez +import logging +import torch.distributed as tdist + +import yaml +from pathlib import Path from megatron import update_num_microbatches, get_tokenizer from megatron.core import mpu, tensor_parallel @@ -24,6 +30,12 @@ UNIVERSAL_CHECKPOINT_VERSION_VALUE, ) +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +DEVICE = ez.get_torch_device() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") + _CHECKPOINT_VERSION = None @@ -225,6 +237,10 @@ def get_rng_state(): def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): """Save a model checkpoint.""" args = get_args() + assert args is not None + iteration = args.iteration + + save_lr_state_dict() # Only rank zero of the data parallel writes to the disk. if not args.deepspeed: @@ -529,15 +545,81 @@ def _set_arg(arg_name, old_arg_name=None, force=False): return args, checkpoint_args -def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, load_only_weights=False): +def load_lr_state_dict(strict: bool = False) -> dict: + """Load {iteration, lr} from .yaml file when restoring from checkpoint.""" + args = get_args() + assert args is not None + lr_state_dict_fp = Path(args.load).joinpath( + f"lr_state_dict_{RANK}_of_{WORLD_SIZE}.yaml" + ) + lr_state_dict = {} + if lr_state_dict_fp.is_file(): + with lr_state_dict_fp.open('r') as f: + lr_state_dict = yaml.safe_load(f) + args.lr = lr_state_dict['lr'] + else: + if strict: + raise FileNotFoundError( + f"{lr_state_dict_fp=}.is_file() is False" + ) + log.info( + f"Unable to load lr_state_dict from {lr_state_dict_fp=}, " + f"but strict=False. Returning empty dictionary: {lr_state_dict=}" + ) + return lr_state_dict + + +def save_lr_state_dict() -> None: + """Save {iteration, lr} to .yaml file for safe-keeping. + + Make sure we're only saving from RANK == 0. 
+ """ + if RANK != 0: + return None + args = get_args() + assert args is not None + outdir = getattr(args, 'save', None) + assert outdir is not None + lr_state_dict_fp = Path(args.save).joinpath( + "lr_state_dict.yaml" + ) + log.info(f"Saving lr_state_dict to {lr_state_dict_fp.as_posix()}") + with lr_state_dict_fp.open('w') as f: + yaml.dump( + {'iteration': args.iteration, 'lr': args.lr}, + f + ) + + +def load_checkpoint( + model, + optimizer, + opt_param_scheduler, + load_arg: str = 'load', + strict: bool = True, + load_only_weights: bool = False, + strict_lr_state_dict: bool = False +): """Load a model checkpoint and return the iteration. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint match the names of parameters and buffers in model. """ args = get_args() + assert args is not None load_dir = getattr(args, load_arg) - + lr_state_dict = {} + lr_tensor = torch.tensor(args.lr, requires_grad=False, device=DEVICE) + if RANK == 0: + lr_state_dict = load_lr_state_dict(strict=strict_lr_state_dict) + if len(lr_state_dict.keys()) > 0 and 'lr' in lr_state_dict: + lr_tensor = torch.tensor( + lr_state_dict['lr'], + requires_grad=False, + device=DEVICE, + ) + tdist.broadcast(lr_tensor, 0) + args.lr = lr_tensor.item() if args.deepspeed: if args.finetune: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, @@ -553,7 +635,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri print_rank_0(' will not load any checkpoints and will start from ' 'random') return 0 - release = False + release = False else: model = unwrap_model(model) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 407bb16d56..453019f8c9 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -14,7 +14,7 @@ from megatron.core.enums import ModelType from megatron.core.utils import get_attr_wrapped_model, 
get_model_type, get_model_config -from megatron.utils import unwrap_model +from megatron.utils import print_rank_0, unwrap_model from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module @@ -228,7 +228,14 @@ def forward_step(forward_step_func, return [output_tensor] -def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model=None): +def backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + model_type, + config, + model=None +): """Backward step through passed-in output tensor. If last stage, output_tensor_grad is None, otherwise gradient of loss @@ -241,12 +248,19 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c # needs to be modified slightly to support arbitrary numbers of skip # connections. args = get_args() - if args.deepspeed: - assert model is not None - + assert args is not None if config.timers is not None: config.timers('backward-compute', log_level=2).start() - + if (to_skip := getattr(args, 'train_iters_to_skip', None)) is not None: + if config.timers is not None: + config.timers('backward-compute').stop() + if len(to_skip) > 0 and args.iteration in [int(i) for i in to_skip]: + print_rank_0( + f'Caught {args.iteration=} in `iters_to_skip`! Skipping!' + ) + return [None] + if args.deepspeed: + assert model is not None # Retain the grad on the input_tensor. unwrap_input_tensor_grad = False if not isinstance(input_tensor, list): @@ -255,24 +269,20 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c for x in input_tensor: if x is not None: x.retain_grad() - if not isinstance(output_tensor, list): output_tensor = [output_tensor] if not isinstance(output_tensor_grad, list): output_tensor_grad = [output_tensor_grad] - # Backward pass. 
if args.deepspeed: model.backward(output_tensor[0]) else: if output_tensor_grad[0] is None and config.grad_scale_func is not None: output_tensor[0] = config.grad_scale_func(output_tensor[0]) - if config.deallocate_pipeline_outputs: custom_backward(output_tensor[0], output_tensor_grad[0]) else: torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) - # Collect the grad of the input_tensor. input_tensor_grad = [None] if input_tensor is not None: @@ -282,7 +292,6 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad.append(None) else: input_tensor_grad.append(x.grad) - # Handle single skip connection if it exists (encoder_hidden_state in # model with encoder and decoder). if parallel_state.get_pipeline_model_parallel_world_size() > 1 and \ @@ -292,10 +301,8 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad[-1].add_(output_tensor_grad[1]) if unwrap_input_tensor_grad: input_tensor_grad = input_tensor_grad[0] - if config.timers is not None: config.timers('backward-compute').stop() - return input_tensor_grad diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 9dcdc0459f..d0453d25ea 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -69,14 +69,14 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): vocab_size = exp_logits.size(-1) if label_smoothing > 0: - """ + r""" We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. 
= (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K - From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + From: """ assert 1.0 > label_smoothing > 0.0 smoothing = label_smoothing * vocab_size / (vocab_size - 1) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index 2516e58415..f3276c6823 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -34,8 +34,7 @@ def __init__(self, datasets, weights, size, *, # Build indicies. def _build_indices(): start_time = time.time() - assert num_datasets < 255 - dataset_index = np.zeros(self.size, dtype=np.uint8) + dataset_index = np.zeros(self.size, dtype=np.int64) dataset_sample_index = np.zeros(self.size, dtype=np.int64) from megatron.data import helpers diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2d7da67e15..b242101b3a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,7 +2,6 @@ """Dataloaders.""" - import random import torch import numpy as np @@ -41,38 +40,55 @@ def build_pretraining_data_loader(dataset, consumed_samples): args.dataloader_type)) # Torch dataloader. 
- loader = torch.utils.data.DataLoader(dataset, - batch_sampler=batch_sampler, - num_workers=args.num_workers, - pin_memory=True) + loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + multiprocessing_context=( + args.multiprocessing_context if args.num_workers > 0 + else None + ) + ) if args.repeated_dataloader: loader=RepeatingLoader(loader) return loader class MegatronPretrainingSampler: - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, drop_last=True): + def __init__( + self, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + drop_last=True + ): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size + ) self.drop_last = drop_last # Sanity checks. 
- assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) + assert self.consumed_samples < self.total_samples, ( + 'no samples left to consume: ' + f'{self.consumed_samples}, {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -122,8 +138,16 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: - def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, data_sharding): + def __init__( + self, + dataset, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_sharding + ): # Keep a copy of input params for later use. self.dataset = dataset self.total_samples = total_samples @@ -132,19 +156,23 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size - self.last_batch_size = \ + ) + self.last_batch_size = ( self.total_samples % self.micro_batch_times_data_parallel_size + ) # Sanity checks. 
- assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -160,23 +188,31 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_times_data_parallel_size + ) + ) bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + full_bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_size + ) + ) full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = \ + idx_range_total = ( torch.randperm(full_bucket_size, generator=g).tolist() + ) idx_range_active = idx_range_total[full_bucket_offset:] idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] @@ -187,4 +223,4 @@ def __iter__(self): if len(batch) == self.micro_batch_size: self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch - batch = [] \ No newline at end of 
file + batch = [] diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 1d9b7e1c1d..210a92c85e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -273,7 +273,31 @@ def __getitem__(self, idx): args = get_args() orig_idx = idx # Get the shuffled index. - idx = self.shuffle_idx[idx] + try: + idx = self.shuffle_idx[idx] + except IndexError as exc: + if is_rank_0(): + import json + from rich import print_json + print(exc) + print( + '\n'.join( + ['-------------------------------------------------', + f'Trying to access {idx=} from self.shuffle_idx,', + f'but {len(self.shuffle_idx)=}', + '-------------------------------------------------'] + ) + ) + print_json( + json.dumps( + { + 'doc_idx': len(self.doc_idx), + 'sample_idx': len(self.sample_idx), + 'shuffle_idx': len(self.shuffle_idx), + }, + indent=4, + ) + ) # Start and end documents and offsets. doc_index_f = self.sample_idx[idx][0] doc_index_l = self.sample_idx[idx + 1][0] diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 5c3a054875..142f159dd3 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -17,10 +17,10 @@ using namespace std; const int32_t LONG_SENTENCE_LEN = 512; -void build_blending_indices(py::array_t& dataset_index, +void build_blending_indices(py::array_t& dataset_index, py::array_t& dataset_sample_index, const py::array_t& weights, - const int32_t num_datasets, + const int64_t num_datasets, const int64_t size, const bool verbose) { /* Given multiple datasets and a weighting array, build samples such that it follows those wieghts.*/ @@ -58,7 +58,7 @@ void build_blending_indices(py::array_t& dataset_index, } // Populate the indices. - dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_index_ptr[sample_idx] = static_cast(max_error_index); dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; // Update the total samples. 
diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 219ffe8031..62ebdc9813 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -504,6 +504,7 @@ def _do_init(self, path, skip_warmup): print_rank_0(" warming up data mmap file...") _warmup_mmap_file(data_file_path(self._path)) print_rank_0(" creating numpy buffer of mmap...") + print_rank_0(data_file_path(self._path)) self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C') print_rank_0(" creating memory view of numpy buffer...") self._bin_buffer = memoryview(self._bin_buffer_mmap) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index cb284b3c34..9f833fbd19 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -96,7 +96,7 @@ def set_global_variables(args): if args.exit_signal_handler: _set_signal_handler() - + def set_args(args): global _GLOBAL_ARGS @@ -137,11 +137,10 @@ def _set_tensorboard_writer(args): global _GLOBAL_TENSORBOARD_WRITER _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 'tensorboard writer') - if hasattr(args, 'tensorboard_dir') and \ args.tensorboard_dir and args.rank == (args.world_size - 1): try: - from torch.utils.tensorboard import SummaryWriter + from torch.utils.tensorboard.writer import SummaryWriter print('> setting tensorboard ...') _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( log_dir=args.tensorboard_dir, @@ -179,9 +178,9 @@ def _set_timers(args): def _ensure_var_is_initialized(var, name): """Make sure the input variable is not None.""" - assert var is not None, '{} is not initialized.'.format(name) + assert var is not None, f'{name} is not initialized.' def _ensure_var_is_not_initialized(var, name): """Make sure the input variable is not None.""" - assert var is None, '{} is already initialized.'.format(name) + assert var is None, f'{name} is already initialized.' 
diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 2306749fcb..141c901ffa 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,12 +1,18 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -from deepspeed.accelerator.real_accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator.real_accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from apex.normalization import MixedFusedRMSNorm as RMSNorm -else: + HAS_APEX = True +except Exception: + HAS_APEX = False from .rmsnorm import RMSNorm from torch.nn import LayerNorm +# else: +# from .rmsnorm import RMSNorm +# from torch.nn import LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index ec2ae1877a..ceef5be725 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -389,10 +389,16 @@ def __init__(self, post_process=True, num_experts=[1]): args = get_args() - # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. - if args.untie_embeddings_and_output_weights: assert not add_decoder - super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) - + # TODO: passing `share_embeddings_and_output_weights=False` + # will not work correctly for T5 and embeddings will not be synced. + # Fix later for T5. 
+ if args.untie_embeddings_and_output_weights: + assert not add_decoder + super(TransformerLanguageModel, self).__init__( + share_embeddings_and_output_weights=( + not args.untie_embeddings_and_output_weights + ) + ) self.pre_process = pre_process self.post_process = post_process self.hidden_size = config.hidden_size @@ -405,27 +411,35 @@ def __init__(self, self.add_pooler = add_pooler self.encoder_hidden_state = None self.add_retriever = args.retro_add_retriever - self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + self.untie_embeddings_and_output_weights = ( + args.untie_embeddings_and_output_weights + ) self.num_experts = num_experts # Embeddings. if self.pre_process: - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - self.num_tokentypes, - args.embedding_weights_in_fp32) + self.embedding = Embedding( + self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + args.embedding_weights_in_fp32 + ) self._embedding_key = 'embedding' # Rotary positional embeddings - self.use_rotary_position_embeddings = \ - args.use_rotary_position_embeddings + self.use_rotary_position_embeddings = ( + args.use_rotary_position_embeddings + ) if args.use_rotary_position_embeddings: self.seq_length = args.seq_length - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -433,15 +447,22 @@ def __init__(self, # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta) + 
self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + theta=args.rope_theta + ) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). if self.add_encoder: self.encoder = ParallelTransformer( config, - model_type=args.model_type if not args.retro_add_retriever \ - else ModelType.retro_decoder, + # args.model_type if not args.retro_add_retriever + # else ModelType.retro_decoder + model_type=( + ModelType.retro_decoder if args.retro_add_retriever + else args.model_type + ), self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, @@ -461,7 +482,8 @@ def __init__(self, self_attn_mask_type=self.decoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, - num_experts=self.num_experts) + num_experts=self.num_experts + ) self._decoder_key = 'decoder' else: self.decoder = None @@ -478,24 +500,30 @@ def __init__(self, args.padded_vocab_size, config=config, init_method=self.init_method, - bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + # Setting bias to False always to keep it consistent with + # embedding tying that also does not have a bias. 
+ bias=False + ) self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): """ See megatron.model.transformer.set_input_tensor()""" - # This is usually handled in schedules.py but some inference code still # gives us non-lists or None if not isinstance(input_tensor, list): input_tensor = [input_tensor] if self.add_encoder and self.add_decoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with both encoder and decoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with both encoder and decoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_encoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with only encoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with only encoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_decoder: if len(input_tensor) == 2: @@ -505,32 +533,50 @@ def set_input_tensor(self, input_tensor): self.decoder.set_input_tensor(None) self.encoder_hidden_state = input_tensor[0] else: - raise Exception('input_tensor must have either length 1 or 2') + raise Exception( + 'input_tensor must have either length 1 or 2' + ) else: - raise Exception('Stage must have at least either encoder or decoder') - - def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, - dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - enc_dec_attn_mask=None, tokentype_ids=None, - inference_params=None, - pooling_sequence_index=0, - enc_hidden_states=None, output_enc_hidden=False): + raise Exception( + 'Stage must have at least either encoder or decoder' + ) + + def forward( + self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + dec_input_ids=None, + dec_position_ids=None, + dec_attn_mask=None, + retriever_input_ids=None, + 
retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, + tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, + output_enc_hidden=False + ): args = get_args() # Encoder embedding. if self.pre_process: - encoder_input = self.embedding(enc_input_ids, enc_position_ids, - tokentype_ids=tokentype_ids) + encoder_input = self.embedding( + enc_input_ids, + enc_position_ids, + tokentype_ids=tokentype_ids + ) else: encoder_input = None # Retriever embedding. if self.add_retriever and self.pre_process: - retriever_input = self.embedding(retriever_input_ids, - retriever_position_ids, - tokentype_ids=tokentype_ids) + retriever_input = self.embedding( + retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids + ) else: retriever_input = None diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 12a458375d..48f2737a06 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,13 +1,9 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.optimizers import FusedAdam as Adam - from apex.optimizers import FusedSGD as SGD -else: - from torch.optim import Adam - from torch.optim import SGD +import torch +from typing import Callable, Any, Iterable, Union from megatron import get_args from .distrib_optimizer import DistributedOptimizer @@ -15,19 +11,60 @@ from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer -def get_param_groups(modules, - no_weight_decay_cond, - scale_lr_cond, - lr_mult): - """creates param groups based on weight decay condition (regularized vs non regularized) - and learning rate scale condition (args.lr vs lr_mult * args.lr) - scale_lr_cond is used during finetuning where head of the network requires a scaled - version of the base learning rate. 
+import ezpz as ez +RANK = ez.get_rank() + + +def get_param_groups( + modules: Union[torch.nn.Module, Iterable[torch.nn.Module]], + no_weight_decay_cond: Callable[[str, torch.Tensor], bool], + scale_lr_cond: Callable[[str, torch.Tensor], bool], + lr_mult: Any, + use_galore: bool = False, +): + """ + Creates param groups (regularized vs non) based on: + + - weight decay condition. + - learning rate scale condition (args.lr vs lr_mult * args.lr) + - scale_lr_cond is used during finetuning, where head of the network + requires a scaled version of the base learning rate. + # if 'galore' in args.optimizer.lower(): + # # make parameters with "rank" to a single group, if param_name has "mlp" or "attn" + # galore_params = [] + # target_modules_list = ["attn", "mlp"] + # # for module_name, module in param_groups: + # for group_id, group in enumerate(param_groups): + # for param, p in enumerate(group['params']): + # if not isinstance(module, torch.nn.Linear): + # continue + # if not any(target_key in module_name for target_key in target_modules_list): + # continue + # print('enable GaLore for weights in module: ', module_name) + # galore_params.append(module.weight) + # id_galore_params = [id(p) for p in galore_params] + # # make parameters without "rank" to another group + # regular_params = [p for p in param_groups if id(p) not in id_galore_params] + # # then call galore_adamw + # param_groups = [ + # { + # 'params': regular_params + # }, + # { + # 'params': galore_params, + # 'rank': RANK, + # 'update_proj_gap': args.update_proj_gap, + # 'scale': args.galore_scale, + # 'proj_type': args.proj_type + # } + # ] """ wd_no_scale_lr = [] wd_scale_lr = [] no_wd_no_scale_lr = [] no_wd_scale_lr = [] + galore_params = [] + target_modules_list = ["attn", "mlp"] for module in modules: for name, param in module.named_parameters(): if not param.requires_grad: @@ -65,20 +102,30 @@ def get_param_groups(modules, return param_groups -def get_megatron_optimizer(model, - 
no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): + +def get_megatron_optimizer( + model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0 +): args = get_args() + assert args is not None # Base optimizer. - param_groups = get_param_groups(model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult) + param_groups = get_param_groups( + model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult + ) if args.create_moe_param_group: - from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups) + from deepspeed.moe.utils import ( + split_params_into_different_moe_groups_for_optimizer + ) + param_groups = split_params_into_different_moe_groups_for_optimizer( + param_groups + ) if args.cpu_optimizer: assert args.optimizer == 'adam', 'CPU offloading is for Adam' @@ -87,45 +134,200 @@ def get_megatron_optimizer(model, else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam - optimizer = cpu_adam_optimizer(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - else: - if args.optimizer == 'adam': - if args.ds_fused_adam: - global Adam - from deepspeed.ops.adam import FusedAdam - Adam = FusedAdam - optimizer = Adam(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) - else: - raise Exception('{} optimizer is not supported.'.format( - args.optimizer)) + optimizer = cpu_adam_optimizer( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif args.optimizer.lower() == "galore_adamw": + from galore_torch import GaLoreAdamW, GaLoreAdamW8bit + # 
redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw": + # redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + # implement adafactor + elif args.optimizer.lower() == "adafactor": + import transformers + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = transformers.optimization.Adafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # low-rank adafactor + elif args.optimizer.lower() == "galore_adafactor": + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = GaLoreAdafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # 8-bit Adam + elif args.optimizer.lower() == "adam8bit": + import bitsandbytes as bnb + optimizer = bnb.optim.Adam8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw8bit": + optimizer = GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == 'galore_adamw8bit_per_layer': + # TODO: seems scheduler call twice in one update step, need to check, for now double the num_training_steps, warmup_steps and update_proj_gap + optimizer_dict = {} + for p in model.parameters(): + if p.requires_grad: + if id(p) in id_galore_params: + optimizer_dict[p] = GaLoreAdamW8bit([{'params': [p], 'rank': args.rank, 'update_proj_gap': args.update_proj_gap * 2, 'scale': args.galore_scale, 'proj_type': args.proj_type}], lr=args.lr, weight_decay=args.weight_decay) + else: + optimizer_dict[p] = 
bnb.optim.Adam8bit([p], lr=args.lr, weight_decay=args.weight_decay) + # get scheduler dict + scheduler_dict = {} + from galore_torch.peft_pretraining import training_utils + for p in model.parameters(): + if p.requires_grad: + scheduler_dict[p] = training_utils.get_scheculer( + optimizer=optimizer_dict[p], + scheduler_type=args.scheduler, + num_training_steps=args.num_training_steps * 2, + warmup_steps=args.warmup_steps * 2, + min_lr_ratio=args.min_lr_ratio, + ) + + def optimizer_hook(p): + if p.grad is None: + return + optimizer_dict[p].step() + optimizer_dict[p].zero_grad() + scheduler_dict[p].step() + # Register the hook onto every parameter + for p in model.parameters(): + if p.requires_grad: + p.register_post_accumulate_grad_hook(optimizer_hook) + layer_wise_flag = True + elif str(args.optimizer) == 'ipex.lamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer) == 'ipex.fusedlamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + fused=True, + ) + elif str(args.optimizer).lower() == 'ds.fusedlamb': + from deepspeed.ops.lamb import FusedLamb + optimizer = FusedLamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer).lower() == 'adamwschedulefree': + import schedulefree + optimizer = schedulefree.AdamWScheduleFree( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif str(args.optimizer).lower() == 'sgdschedulefree': + import schedulefree + optimizer = 
schedulefree.SGDScheduleFree( + param_groups, + lr=args.lr, + momentum=args.sgd_momentum, + weight_decay=args.weight_decay, + warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, + ) + elif str(args.optimizer).lower() == 'apex.adam': + assert get_accelerator().device_name() == 'cuda' + from apex.optimizers import FusedAdam as Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif str(args.optimizer).lower() == 'apex.sgd': + from apex.optimizers import FusedSGD as SGD + optimizer = SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'adamw': + optimizer = torch.optim.AdamW( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'adam': + if args.ds_fused_adam: + # global Adam + from deepspeed.ops.adam import FusedAdam + Adam = FusedAdam + else: + Adam = torch.optim.Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'sgd': + optimizer = torch.optim.SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + else: + raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer - # Determine whether the params have main-grad field. params_have_main_grad = False if args.use_contiguous_buffers_in_local_ddp: params_have_main_grad = True - # Mixed precision optimizer. # - Note: both the Float16Optimizer and the DistributedOptimizer inherit # from the MixedPrecisionOptimizer, which manages any optimizer where # the model params and main params are distinct. 
if args.fp16 or args.bf16 or args.use_distributed_optimizer: - # Grad scaler: # if loss-scale is provided, instantiate the constant scaler. # if we are using fp16 and loss-scale is not present, use a @@ -133,11 +335,9 @@ def get_megatron_optimizer(model, # otherwise we are running in bf16 with no loss-scale so # leave it as None. grad_scaler = None - # Constant loss scale. if args.loss_scale: grad_scaler = ConstantGradScaler(args.loss_scale) - # Dynamic loss scale. else: if args.fp16: @@ -148,11 +348,11 @@ def get_megatron_optimizer(model, backoff_factor=0.5, growth_interval=args.loss_scale_window, hysteresis=args.hysteresis) - # Megatron optimizer. - opt_ty = DistributedOptimizer \ - if args.use_distributed_optimizer else \ - Float16OptimizerWithFloat16Params + opt_ty = ( + DistributedOptimizer if args.use_distributed_optimizer + else Float16OptimizerWithFloat16Params + ) return opt_ty(optimizer, args.clip_grad, args.log_num_zeros_in_grad, @@ -163,10 +363,12 @@ def get_megatron_optimizer(model, args.params_dtype, grad_scaler, model) - # FP32. 
- return FP32Optimizer(optimizer, args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - model) + return FP32Optimizer( + optimizer, + args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + model + ) diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index afec8f220c..b5141d0059 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -8,10 +8,14 @@ except ModuleNotFoundError: from torch import inf as inf -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.multi_tensor_apply import multi_tensor_applier import amp_C + HAS_APEX = True +except Exception: + HAS_APEX = False from megatron.model.module import param_is_not_shared from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate @@ -71,7 +75,7 @@ def clip_grad_norm_fp32(parameters, grads_for_norm, else: if norm_type == 2.0: - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == 'cuda' and HAS_APEX: dummy_overflow_buf = torch.cuda.IntTensor([0]) # Use apex's multi-tensor applier for efficiency reasons. 
# Multi-tensor applier takes a function and a list of list diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index 1aeeac3444..10331607d9 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -2,10 +2,11 @@ """Megatron distributed optimizer.""" -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.optimizers import FusedAdam as Adam -else: +except Exception: from torch.optim import Adam import math diff --git a/megatron/timers.py b/megatron/timers.py index 384c7c37a3..870ba8996f 100644 --- a/megatron/timers.py +++ b/megatron/timers.py @@ -8,8 +8,14 @@ import torch from deepspeed.accelerator import get_accelerator +from tensorboard.summary import Writer from packaging import version +try: + import wandb +except Exception: + wandb = None + class TimerBase(ABC): @@ -292,8 +298,15 @@ def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): print(output_string, flush=True) - def write(self, names, writer, iteration, normalizer=1.0, - reset=False, barrier=False): + def write( + self, + names: list[str], + writer: Writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = False, + barrier: bool = False + ): """Write timers to a tensorboard writer Note that we only report maximum time across ranks to tensorboard. 
""" @@ -303,7 +316,16 @@ def write(self, names, writer, iteration, normalizer=1.0, assert normalizer > 0.0 name_to_min_max_time = self._get_global_min_max_time( names, reset, barrier, normalizer) + timer_data = { + 'timers/iteration': iteration, + **{ + f'timers/{k}-time': name_to_min_max_time[k][1] + for k in name_to_min_max_time + } + } + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb.log(timer_data, commit=False) if writer is not None: for name in name_to_min_max_time: _, max_time = name_to_min_max_time[name] - writer.add_scalar(name + '-time', max_time, iteration) + writer.add_scalar(f'{name}-time', max_time, iteration) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 43c251bab1..023c2f756c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -8,6 +8,8 @@ from transformers import AutoTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer + + def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: @@ -35,12 +37,15 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) elif args.tokenizer_type == 'NullTokenizer': assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == 'HFTokenizer': assert args.tokenizer_model is not None - tokenizer = _HFTokenizer(args.tokenizer_model,args.seq_length) + tokenizer = _HFTokenizer(args.tokenizer_model) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -504,6 +509,56 @@ def eod(self): def additional_special_tokens_ids(self): return None + +class 
_Llama2Tokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + # BOS / EOS token IDs + self.n_words: int = self.tokenizer.vocab_size() + self.bos_id: int = self.tokenizer.bos_id() + self.eos_id: int = self.tokenizer.eos_id() + self.pad_id: int = self.tokenizer.pad_id() + assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size() + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + assert type(s) is str + t = self.tokenizer.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + class _NullTokenizer: def __init__(self, vocab_size): vocab_size = int(vocab_size) @@ -540,28 +595,10 @@ def additional_special_tokens_ids(self): class _HFTokenizer(AbstractTokenizer): """HF Tokenizer""" - def __init__(self, tokenizer_name_or_path,max_seq_len): + def __init__(self, tokenizer_name_or_path): name = tokenizer_name_or_path super().__init__(name) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,padding_side="right",use_fast=False) - - DEFAULT_PAD_TOKEN = "[PAD]" - DEFAULT_EOS_TOKEN = "" - DEFAULT_BOS_TOKEN = "" - DEFAULT_UNK_TOKEN = "" - special_tokens_dict = dict() - if self.tokenizer.pad_token is None: - special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN - if self.tokenizer.eos_token is None: - special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN - if self.tokenizer.bos_token is None: - special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN - if 
self.tokenizer.unk_token is None: - special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN - self.tokenizer.add_special_tokens(special_tokens_dict) - # if self.tokenizer.pad_token == None: - # self.tokenizer.pad_token= "[PAD]" - self.tokenizer.model_max_length = max_seq_len + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} diff --git a/megatron/training.py b/megatron/training.py index 19b8a6c71f..035912bb74 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -10,6 +10,7 @@ # The earliest we can measure the start time. _TRAIN_START_TIME = time.time() import torch +import torch.distributed as tdist from collections import OrderedDict from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP @@ -49,9 +50,16 @@ from deepspeed.compression.compress import init_compression, redundancy_clean from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd from megatron.model.transformer import ParallelTransformerLayer +import ezpz as ez +import logging from deepspeed import comm as dist +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") + try: import wandb except (ImportError, ModuleNotFoundError): @@ -60,9 +68,40 @@ def print_datetime(string): """Note that this call will sync across all ranks.""" - torch.distributed.barrier() + tdist.barrier() time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + log.info('[' + string + '] datetime={} '.format(time_str)) + + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) ''' Since v0.9.0, deepspeed.initialize() has forbidden simultaneous setting of args.deepspeed_config (Path) and ds_config dict. @@ -70,30 +109,30 @@ def print_datetime(string): ''' def _create_ds_config_dict(): args = get_args() + assert args is not None if isinstance(args.deepspeed_config, dict) : ds_config_dict = args.deepspeed_config else: with open(args.deepspeed_config, 'r', encoding='utf-8') as config_file: ds_config_dict = json.load(config_file) - if args.universal_checkpoint: ds_config_dict["checkpoint"] = {"load_universal": True} - # Clear config path - args.deepspeed_config = None - + args.deepspeed_config = None return ds_config_dict - -def pretrain(train_valid_test_dataset_provider, - model_provider, - model_type, - forward_step_func, - process_non_loss_data_func=None, - extra_args_provider=None, - args_defaults={}, - data_post_process=None, - external_args={}): + +def pretrain( + train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + extra_args_provider=None, + args_defaults={}, + data_post_process=None, + external_args={}, +) -> torch.nn.Module: """Main training program. This function will run the followings in the order provided: @@ -121,6 +160,9 @@ def pretrain(train_valid_test_dataset_provider, to it. It is used for programs to add their own arguments. 
args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. + + Returns: + model (torch.nn.Module) """ # Initalize and get arguments, timers, and Tensorboard writer. @@ -135,15 +177,18 @@ def pretrain(train_valid_test_dataset_provider, # image ... launches. global _TRAIN_START_TIME start_time_tensor = get_accelerator().DoubleTensor([_TRAIN_START_TIME]) - torch.distributed.all_reduce(start_time_tensor, - op=torch.distributed.ReduceOp.MIN) + tdist.all_reduce(start_time_tensor, op=tdist.ReduceOp.MIN) + # torch.distributed.all_reduce(start_time_tensor, + # op=torch.distributed.ReduceOp.MIN) _TRAIN_START_TIME = start_time_tensor.item() - print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + log.info('time to initialize megatron (seconds)={:.3f}'.format( time.time() - _TRAIN_START_TIME)) print_datetime('after megatron is initialized') args = get_args() timers = get_timers() + assert args is not None + assert timers is not None if args.deepspeed: args.deepspeed_config_dict = _create_ds_config_dict() @@ -211,16 +256,16 @@ def pretrain(train_valid_test_dataset_provider, args.teacher_model = setup_teacher_model(args, model_provider) # Print setup timing. 
- print_rank_0('done with setup ...') + log.info('done with setup ...') timers.log(['model-and-optimizer-setup', 'train/valid/test-data-iterators-setup'], barrier=True) if not args.skip_train: - print_rank_0('training ...') + log.info('training ...') if args.dataloader_type == 'cyclic' and args.retro_add_retriever: args.train_iters = args.retro_cyclic_train_iters - print_rank_0("retro cyclic train iters : %d" % args.train_iters) + log.info("retro cyclic train iters : %d" % args.train_iters) iteration = 0 if args.do_train and args.train_iters > 0: @@ -237,7 +282,7 @@ def pretrain(train_valid_test_dataset_provider, if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) else: - print_rank_0('skipping training (--skip-train is on) ...') + log.info('skipping training (--skip-train is on) ...') iteration = args.iteration @@ -285,12 +330,12 @@ def update_train_iters(args): args.global_batch_size args.train_iters = iterations - print_rank_0('setting training iterations to {}'.format(args.train_iters)) + log.info('setting training iterations to {}'.format(args.train_iters)) def setup_teacher_model(args, model_provider): - print_rank_0('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) + log.info('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) iteration_stuent = args.iteration num_layers_student = args.num_layers num_experts_student = args.num_experts @@ -298,7 +343,7 @@ def setup_teacher_model(args, model_provider): num_attention_heads_student = args.num_attention_heads load_student = args.load - print_rank_0('***>>>>> Setting up the teacher model') + log.info('***>>>>> Setting up the teacher model') args.num_layers = args.num_layers_teacher args.num_experts = args.num_experts_teacher @@ -306,7 +351,7 @@ def setup_teacher_model(args, model_provider): args.num_attention_heads = args.num_attention_heads_teacher args.load = args.load_teacher teacher_model, _, _ = 
load_model_weights_only(model_provider) - print_rank_0('***>>>>> Teacher model:{}'.format(teacher_model)) + log.info('***>>>>> Teacher model:{}'.format(teacher_model)) args.num_layers = num_layers_student args.num_experts = num_experts_student @@ -320,6 +365,7 @@ def setup_teacher_model(args, model_provider): def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): """Build the model.""" args = get_args() + assert args is not None args.model_type = model_type # Build model. @@ -389,7 +435,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap # Print number of parameters. if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on (tensor, pipeline) ' - 'model parallel rank ({}, {}): {}'.format( + 'model parallel rank ({}, {})={}'.format( mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) @@ -481,7 +527,7 @@ def get_optimizer_param_scheduler(optimizer): def load_model_weights_only(model_provider_func): """Setup model and optimizer.""" args = get_args() - print_rank_0('***>>>>> Args:{}'.format(args)) + log.info('***>>>>> Args:{}'.format(args)) model = get_model(model_provider_func) @@ -540,7 +586,7 @@ def setup_model_and_optimizer(model_provider_func, else: args.iteration = 0 student_global_steps = model[0].global_steps - print_rank_0('***>>>>> Student model, global step:{}'.format(student_global_steps)) + log.info('***>>>>> Student model, global step:{}'.format(student_global_steps)) if args.compression_training: model, _, _, _ = deepspeed.initialize( @@ -568,7 +614,7 @@ def setup_model_and_optimizer(model_provider_func, opt_param_scheduler = get_optimizer_param_scheduler(optimizer) if args.deepspeed: - print_rank_0("DeepSpeed is enabled.") + log.info("DeepSpeed is enabled.") pp = mpu.get_pipeline_model_parallel_world_size() if 
args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: train_ds = None @@ -644,7 +690,7 @@ def setup_model_and_optimizer(model_provider_func, # get model without FP16 and/or TorchDDP wrappers if args.iteration == 0 and len(unwrapped_model) == 1 \ and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): - print_rank_0("Initializing ICT from pretrained BERT model") + log.info("Initializing ICT from pretrained BERT model") unwrapped_model[0].init_state_dict_from_bert() if args.fp16: optimizer.reload_model_params() @@ -734,8 +780,11 @@ def train_step(forward_step_func, data_iterator, increment = get_num_microbatches() * \ args.micro_batch_size * \ args.data_parallel_size - model[0].step(lr_kwargs={'increment': increment}) - update_successful = model[0].was_step_applied() + try: + model[0].step(lr_kwargs={'increment': increment}) + update_successful = model[0].was_step_applied() + except Exception: + update_successful = False else: update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) timers('optimizer').stop() @@ -755,7 +804,6 @@ def train_step(forward_step_func, data_iterator, skipped_iter = 0 grad_norm = None num_zeros_in_grad = None - loss_reduced = {} for key in losses_reduced[0]: losses_reduced_for_key = [x[key] for x in losses_reduced] @@ -793,7 +841,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, args = get_args() timers = get_timers() writer = get_tensorboard_writer() - + wandb_metrics = {} # Advanced, skipped, and Nan iterations. advanced_iters_key = 'advanced iterations' skipped_iters_key = 'skipped iterations' @@ -852,11 +900,15 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, 'optimizer'] # Calculate batch size. 
- batch_size = args.micro_batch_size * args.data_parallel_size * \ - get_num_microbatches() - - total_iterations = total_loss_dict[advanced_iters_key] + \ - total_loss_dict[skipped_iters_key] + batch_size = ( + args.micro_batch_size + * args.data_parallel_size + * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + + total_loss_dict[skipped_iters_key] + ) # Tensorboard values. # Timer requires all the ranks to call. @@ -870,6 +922,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, writer.add_scalar('steps-vs-tokens/y=steps,x=tokens', iteration, args.consumed_train_tokens) writer.add_scalar('steps-vs-tokens/y=tokens,x=steps', args.consumed_train_tokens, iteration) if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + 'learning-rate/iteration': iteration, + 'learning-rate/learning-rate': learning_rate, + } writer.add_scalar('learning-rate/learning-rate', learning_rate, iteration) writer.add_scalar('learning-rate/learning-rate vs samples', learning_rate, args.consumed_train_samples) @@ -881,7 +937,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, args.consumed_train_samples) writer.add_scalar('batch-size/batch-size vs tokens', batch_size, args.consumed_train_tokens) + wandb_metrics |= { + 'lm-loss-training/iteration': iteration, + 'lm-loss-training/consumed_train_tokens': args.consumed_train_tokens, + } for key in loss_dict: + wandb_metrics |= {f'lm-loss-training/{key}': loss_dict[key]} writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) writer.add_scalar(f"lm-loss-training/{key}" + ' vs samples', loss_dict[key], args.consumed_train_samples) @@ -900,18 +961,21 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, writer.add_scalar('world-size/world-size vs tokens', args.world_size, args.consumed_train_tokens) if grad_norm is not None: + wandb_metrics |= {'training/grad-norm': grad_norm} writer.add_scalar('grad-norm/grad-norm', 
grad_norm, iteration) writer.add_scalar('grad-norm/grad-norm vs samples', grad_norm, args.consumed_train_samples) writer.add_scalar('grad-norm/grad-norm vs tokens', grad_norm, args.consumed_train_tokens) if num_zeros_in_grad is not None: + wandb_metrics |= {'training/num-zeros': num_zeros_in_grad} writer.add_scalar('num-zeros/num-zeros', num_zeros_in_grad, iteration) writer.add_scalar('num-zeros/num-zeros vs samples', num_zeros_in_grad, args.consumed_train_samples) writer.add_scalar('num-zeros/num-zeros vs tokens', num_zeros_in_grad, args.consumed_train_tokens) if params_norm is not None: + wandb_metrics |= {'training/params-norm': params_norm} writer.add_scalar('params-norm/params-norm', params_norm, iteration) writer.add_scalar('params-norm/params-norm vs samples', params_norm, args.consumed_train_samples) @@ -955,7 +1019,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, mem_stats["allocation.all.current"], iteration, ) - if iteration % args.tensorboard_log_interval == 0: # This logging write various optimizer states to tensorboard. This # feature may consume extra GPU memory thus is set at false by default. 
@@ -964,41 +1027,84 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, opt_stats_2 = [0.0] * 4 for _, group in enumerate(optimizer.param_groups): for _, param in enumerate(group['params']): - opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2 - opt_stats[1] += (torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt()).item())**2 - opt_stats[2] += (torch.norm(optimizer.state[param]['exp_avg']).item())**2 - opt_stats[3] += (torch.norm(param).item())**2 - opt_stats[4] += torch.norm(optimizer.state[param]['exp_avg_sq'],p=1).item() - opt_stats[5] += torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt(),p=1).item() - opt_stats[6] += torch.norm(optimizer.state[param]['exp_avg'],p=1).item() - opt_stats[7] += torch.norm(param,p=1).item() - opt_stats_2[0] = max(opt_stats_2[0], abs(optimizer.state[param]['exp_avg_sq'].max().item()), abs(optimizer.state[param]['exp_avg_sq'].min().item())) - opt_stats_2[1] = max(opt_stats_2[1], optimizer.state[param]['exp_avg_sq'].sqrt().abs_().max().item()) - opt_stats_2[2] = max(opt_stats_2[2], abs(optimizer.state[param]['exp_avg'].max().item()), abs(optimizer.state[param]['exp_avg'].min().item())) - opt_stats_2[3] = max(opt_stats_2[3], abs(param.max().item()), abs(param.min().item())) + state_param = getattr(optimizer, 'state', None) + if state_param is not None: + exp_avg_sq = state_param.get('exp_avg_sq', torch.tensor(0.)) + exp_avg = state_param.get('exp_avg', torch.tensor(0.)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + 
abs(exp_avg_sq.min().item()) + ) + opt_stats_2[1] = max( + opt_stats_2[1], + exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()) + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()) + ) # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if args.zero_stage > 0: # ZeRO partiions optimizer states + # opt_stats = opt_stats.clone().detach() + # opt_stats = get_accelerator().FloatTensor opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_sequence_data_parallel_group()) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_sequence_data_parallel_group()) if args.tensor_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_tensor_model_parallel_group()) if args.pipeline_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_pipeline_model_parallel_group()) - + wandb_metrics |= { + 'optimizer/learning_rate': learning_rate, + 
'optimizer/iteration': args.iteration, + 'optimizer/consumed_train_tokens': args.consumed_train_tokens, + 'optimizer/variance_l2': opt_stats[0]**0.5, + 'optimizer/variance_sqrt_l2': opt_stats[1]**0.5, + 'optimizer/momentum_l2': opt_stats[2]**0.5, + 'optimizer/weight_l2': opt_stats[3]**0.5, + 'optimizer/variance_l1': opt_stats[4], + 'optimizer/variance_sqrt_l1': opt_stats[5], + 'optimizer/momentum_l1': opt_stats[6], + 'optimizer/weight_l1': opt_stats[7], + 'optimizer/variance_abs_max': opt_stats_2[0], + 'optimizer/variance_sqrt_abs_max': opt_stats_2[1], + 'optimizer/momentum_abs_max': opt_stats_2[2], + 'optimizer/weight_abs_max': opt_stats_2[3], + } # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if writer and is_last_rank(): writer.add_scalar('optimizer/variance_l2 vs tokens', opt_stats[0]**0.5, args.consumed_train_tokens) @@ -1027,6 +1133,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, writer.add_scalar('optimizer/momentum_abs_max', opt_stats_2[2], iteration) writer.add_scalar('optimizer/weight_abs_max', opt_stats_2[3], iteration) + assert args is not None if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed(barrier=True) elapsed_time_per_iteration = elapsed_time / total_iterations @@ -1039,103 +1146,119 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, elapsed_time, total_iterations ) + num_flops = num_floating_point_operations(args, batch_size) + # throughput = ( + # num_floating_point_operations_so_far - arg + # ) samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size tokens_per_sec = samples_per_sec * seq_len tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size tokens_per_gpu_per_second = tokens_per_sec / args.world_size tokens_per_gpu_per_second_per_replica = tokens_per_gpu_per_second / args.data_parallel_size - if wandb is not None and getattr(wandb, 'run', None) 
is not None: - assert wandb.run is not None - wandb_metrics = { - 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - 'throughput/samples_per_sec': samples_per_sec, - 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - 'throughput/tokens_per_sec': tokens_per_sec, - 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, - 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, - 'throughput/tflops': tflops, - 'throughput/approx_params_in_billions': approx_parameters_in_billions, - 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, - 'throughput/iteration': iteration, - } - if loss_dict is not None: - wandb_metrics |= { - f'loss/{k}': v for k, v in loss_dict.items() - } - wandb_metrics |= {'loss/iteration': iteration} - if writer: - if args.log_timers_to_tensorboard: - writer.add_scalar('iteration-time/iteration-time', - elapsed_time_per_iteration, iteration) - writer.add_scalar('iteration-time/iteration-time vs samples', - elapsed_time_per_iteration, args.consumed_train_samples) - writer.add_scalar('iteration-time/iteration-time vs tokens', - elapsed_time_per_iteration, args.consumed_train_tokens) - log_string = ' iteration {:8d}/{:8d} |'.format( - iteration, args.train_iters) - log_string += ' consumed samples: {:12d} |'.format( - args.consumed_train_samples) - log_string += ' consumed tokens: {:12d} |'.format( - args.consumed_train_tokens) - log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( - elapsed_time_per_iteration * 1000.0) - log_string += ' learning rate: {:.3E} |'.format(learning_rate) - log_string += ' global batch size: {:5d} |'.format(batch_size) - if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s + 'throughput/samples_per_sec': samples_per_sec, + 
'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, + 'throughput/tokens_per_sec': tokens_per_sec, + 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, + 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, + 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, + 'throughput/tflops': tflops, + 'throughput/flops': num_flops, + 'throughput/tflops-new': num_flops / elapsed_time_per_iteration, + 'throughput/approx_params_in_billions': approx_parameters_in_billions, + 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, + 'throughput/iteration': iteration, + } + if loss_dict is not None: wandb_metrics |= { - 'training/iteration': iteration, - 'training/iteration_time': elapsed_time_per_iteration, - 'training/iteration_time_vs_tokens': ( - (elapsed_time_per_iteration - / args.consumed_train_tokens) - ), - 'training/iteration_time_vs_samples': ( - (elapsed_time_per_iteration - / args.consumed_train_samples), - ), - 'training/consumed_samples': args.consumed_train_samples, - 'training/consumed_tokens': args.consumed_train_tokens, + 'loss/iteration': iteration, + **{f'loss/{k}': v for k, v in loss_dict.items()} } + if writer and args.log_timers_to_tensorboard: + writer.add_scalar('iteration-time/iteration-time', + elapsed_time_per_iteration, iteration) + writer.add_scalar('iteration-time/iteration-time vs samples', + elapsed_time_per_iteration, args.consumed_train_samples) + writer.add_scalar('iteration-time/iteration-time vs tokens', + elapsed_time_per_iteration, args.consumed_train_tokens) + log_string = f' iteration={iteration:8d}/{args.train_iters:8d} |' + # .format( iteration, args.train_iters) + log_string += ( + f' consumed_samples={args.consumed_train_samples:12d} |' + # .format(args.consumed_train_samples) + ) + log_string += f' consumed_tokens={args.consumed_train_tokens:12d} |' + # .format( args.consumed_train_tokens) + log_string += ( + ' 
elapsed_time_per_iteration_ms=' + f'{elapsed_time_per_iteration * 1000.0:.1f} |' + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f' learning_rate={learning_rate:.6f} |' + log_string += f' global_batch_size={batch_size:5d} |' + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + 'training/iteration': iteration, + 'training/iteration_time': elapsed_time_per_iteration, + 'training/iteration_time_vs_tokens': ( + (elapsed_time_per_iteration + / args.consumed_train_tokens) + ), + 'training/iteration_time_vs_samples': ( + (elapsed_time_per_iteration + / args.consumed_train_samples), + ), + 'training/consumed_samples': args.consumed_train_samples, + 'training/consumed_tokens': args.consumed_train_tokens, + } for key in total_loss_dict: if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: avg = total_loss_dict[key].item() / \ float(max(1, total_loss_dict[advanced_iters_key])) if avg > 0.0: - log_string += ' {}: {:.6E} |'.format(key, avg) + log_string += ' {}={:.6f} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb.log(wandb_metrics) if loss_scale is not None: - log_string += ' loss scale: {:.1f} |'.format(loss_scale) + log_string += ' loss_scale={:.1f} |'.format(loss_scale) + wandb_metrics |= {'loss/loss_scale': loss_scale} if grad_norm is not None: - log_string += ' grad norm: {:.3f} |'.format(grad_norm) + log_string += ' grad_norm={:.3f} |'.format(grad_norm) + wandb_metrics |= {'loss/grad_norm': grad_norm} if num_zeros_in_grad is not None: - log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) + log_string += ' num_zeros={:.1f} |'.format(num_zeros_in_grad) + wandb_metrics |= {'loss/num_zeros_in_grad': num_zeros_in_grad} if params_norm is not None: - log_string += ' params norm: {:.3f} |'.format(params_norm) + log_string += ' params_norm={:.3f} |'.format(params_norm) + wandb_metrics 
|= {'loss/params_norm': params_norm} if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) + log_string += ' curriculum_seqlen={:5d} |'.format(args.curriculum_seqlen) if args.random_ltd: - log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) - log_string += ' actual seqlen: {:5d} |'.format(seq_len) - log_string += ' number of skipped iterations: {:3d} |'.format( + log_string += ' random_ltd reserved_length={:5d} |'.format(args.random_ltd_reserved_length) + log_string += ' actual_seqlen={:5d} |'.format(seq_len) + log_string += ' number_of_skipped_iterations={:3d} |'.format( total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations: {:3d} |'.format( + log_string += ' number_of_nan_iterations={:3d} |'.format( total_loss_dict[nan_iters_key]) - log_string += ' samples per second: {:.3f} |'.format(samples_per_sec) - log_string += ' tokens per gpu per second (tgs): {:.3f} |'.format(tokens_per_gpu_per_second) - log_string += ' TFLOPs: {:.2f} |'.format(tflops) + log_string += ' samples_per_second={:.3f} |'.format(samples_per_sec) + log_string += ' tokens_per_gpu_per_second_tgs={:.3f} |'.format(tokens_per_gpu_per_second) + log_string += ' TFLOPs={:.2f} |'.format(tflops) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 - print_rank_last(log_string) + # print_rank_last(log_string) + log.info(log_string) if report_memory_flag and learning_rate > 0.: # Report memory after optimizer state has been initialized. 
report_memory('(after {} iterations)'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= {'training/skiped_iterations': total_loss_dict[skipped_iters_key]} + wandb_metrics |= {'training/nan_iterations': total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) return report_memory_flag @@ -1144,6 +1267,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() # Extra barrier is added to make sure # all ranks report the max time. + # assert timers is not None timers('save-checkpoint', log_level=0).start(barrier=True) save_checkpoint(iteration, model, optimizer, opt_param_scheduler) timers('save-checkpoint').stop(barrier=True) @@ -1351,7 +1475,7 @@ def evaluate(forward_step_func, while iteration < args.eval_iters: iteration += 1 if verbose and iteration % args.log_interval == 0: - print_rank_0('Evaluating iter {}/{}'.format(iteration, + log.info('Evaluating iter {}/{}'.format(iteration, args.eval_iters)) forward_backward_func = get_forward_backward_func() @@ -1436,9 +1560,10 @@ def evaluate_and_print_results(prefix, forward_step_func, process_non_loss_data_func, config, verbose) string = ' validation loss at {} | '.format(prefix) for key in total_loss_dict: - string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + string += f"{key} value={total_loss_dict[key].item():.6f}" ppl = math.exp(min(20, total_loss_dict[key].item())) - string += '{} PPL: {:.6E} | '.format(key, ppl) + string += f"{key} PPL={ppl:.6f}" + # string += '{} PPL={:.6f} | '.format(key, ppl) if writer and is_last_rank(): data_type = 'test' if test else 'validation' writer.add_scalar(f'lm-loss-validation/{key} {data_type}', @@ -1462,9 +1587,9 @@ def evaluate_and_print_results(prefix, forward_step_func, 
process_non_loss_data_func(collected_non_loss_data, iteration, writer) length = len(string) + 1 - print_rank_last('-' * length) - print_rank_last(string) - print_rank_last('-' * length) + log.info('-' * length) + log.info(string) + log.info('-' * length) def cyclic_iter(iter): @@ -1489,10 +1614,10 @@ def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): train_val_test_num_samples = [train_samples, eval_iters * args.global_batch_size, test_iters * args.global_batch_size] - print_rank_0(' > datasets target sizes (minimum size):') - print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) - print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) - print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + log.info(' > datasets target sizes (minimum size):') + log.info(' train: {}'.format(train_val_test_num_samples[0])) + log.info(' validation: {}'.format(train_val_test_num_samples[1])) + log.info(' test: {}'.format(train_val_test_num_samples[2])) # Build the datasets. return build_train_valid_test_datasets_provider(train_val_test_num_samples) @@ -1506,7 +1631,7 @@ def build_train_valid_test_data_loaders( (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) - print_rank_0('> building train, validation, and test datasets ...') + log.info('> building train, validation, and test datasets ...') # Backward compatibility, assume fixed batch size. 
if args.iteration > 0 and args.consumed_train_samples == 0: diff --git a/megatron/utils.py b/megatron/utils.py index 97294070af..f6a293281a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -10,8 +10,12 @@ from deepspeed.accelerator import get_accelerator if get_accelerator().device_name() == 'cuda': - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C + try: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C + HAS_APEX = True + except Exception: + HAS_APEX = False from megatron import ( get_args, @@ -74,15 +78,14 @@ def calc_params_l2_norm(model): # Calculate norm dummy_overflow_buf = get_accelerator().IntTensor([0]) - if get_accelerator().device_name() == 'cuda': - + if get_accelerator().device_name() == 'cuda' and HAS_APEX: norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm ) - else : + else: norm = torch.norm(params_data,p=2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py new file mode 100644 index 0000000000..94186fea83 --- /dev/null +++ b/pretrain_gpt_alcf.py @@ -0,0 +1,641 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain GPT""" + +from pathlib import Path +from mpi4py import MPI +import os +from rich import print +import torch +import math +from functools import partial +from megatron import get_args +# from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb +from megatron.arguments import core_transformer_config_from_args +# from megatron.utils import ( +# # report_memory, +# # throughput_calculator, +# # checkpoint_throughput_calculator +# ) +# from pathlib import Path +import logging + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +# import time +from torch import nn +import torch.nn.functional as F +import ezpz as ez + + +# ---- [SETUP COMMS] ------------------------ +# if str(os.environ.get('LAUNCH_CMD', 'mpich')).lower() == 'mpich': +RANK = ez.setup_torch(backend="deepspeed", timeout=7200) +# else: +# RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +LOCAL_RANK = ez.get_local_rank() +DEVICE = ez.get_torch_device() +if torch.cuda.is_available(): + torch.cuda.set_device(LOCAL_RANK) +# ------------------------------------------- +# --- [TURN OFF LOGGER ON ALL RANK != 0] ---- +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") +# ---- [SETUP WANDB FROM RANK 0] -------------- +WANDB_MODE = os.environ.get('WANDB_MODE', None) +DISABLE_WANDB = ( + WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' +) +if RANK == 0 and not 
DISABLE_WANDB: + project_name = ( + os.environ.get( + 'WB_PROJECT', # look for WB_PROJECT in env + os.environ.get( + 'WANDB_PROJECT', # look for WANDB_PROJECT in env + 'AuroraGPT' + ), + ) + ) + print('--------------------------------------------------') + print(f"Setting up W&B from: {RANK} with {project_name}") + print('--------------------------------------------------') + _ = ez.setup_wandb(project_name=project_name) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + log.info('building GPT model ...') + see_memory_usage("Before Building Model", force=True) + args = get_args() + config = core_transformer_config_from_args(args) + # if wandb.run is not None and RANK == 0: + # print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + # try: + # wandb.run.config.update({"args": vars(args)}) + # except Exception: + # log.error( + # 'Unable to `wandb.run.config.update({"args": vars(args)})`' + # ) + # if wandb is not None and wandb.run is not None: + # assert wandb is not None and wandb.run is not None + # print(f'Updating {wandb.run.name=} at {wandb.run.url=}') + # wandb.run.config.update({'args': vars(args)}) + if RANK == 0: + git_ds_info() + if hasattr(mpu, 'get_sequence_data_parallel_group'): + dpg = mpu.get_sequence_data_parallel_group() + elif hasattr(mpu, 'get_data_parallel_group'): + dpg = mpu.get_data_parallel_group() + else: + dpg = None + with deepspeed.zero.Init( + data_parallel_group=dpg, + remote_device=( + None if args.remote_device == 'none' else args.remote_device + ), + config_dict_or_path=args.deepspeed_config_dict, + enabled=args.zero_stage == 3, + mpu=mpu + ): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to + # get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # 
Precompute the attention mask and store it in args. + # This avoids having to pipeline it + # as an activation during training. + # The mask is constant, and thus we can reuse it. + attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=get_accelerator().current_device_name() + ) + ).view(1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, + # cache rotary embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + log.info(80 * '-') + log.info(f"Number of parameters in model: {num_params}") + log.info(80 * '-') + see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + tbdir = args.tensorboard_dir + # tbdir = args.getattr('tensorboard_dir', None) + if tbdir is not None: + log.info(f'Patching tensorboard from {tbdir}') + wandb.tensorboard.patch(root_logdir=tbdir) + + wandb.run.config.update({'num_params': num_params}) + if "args" not in wandb.run.config: + log.info( + f"Updating WandB run.config: [{wandb.run.name}]({wandb.run.get_url()})" + ) + try: + wandb.run.config.update( + {"args": dict(sorted(vars(args).items()))} + ) + except Exception: + log.error( + 'Unable to `wandb.run.config.update({"args": vars(args)})`' + ) + try: + wandb.run.watch( + model, + log='all', + log_graph=True, + ) + except Exception: + pass + return model + + +def get_batch(data_iterator): + """Generate a 
batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + data = next(data_iterator) if data_iterator is not None else None + # # Broadcast data. + # if data_iterator is not None: + # data = next(data_iterator) + # else: + # data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask) + + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 
'seqlen_truncate' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if current_seqlen < args.seq_length: + data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() + elif 'seqlen_reshape' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data['text']) + reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) + data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), + data['text'][:, -(current_seqlen+1):]), 0).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen+1)) + num_row = min(num_row, data['text'].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data['text'] = data['text'][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """ + Modification of `get_batch` to work on `next(data_iterator)` + instead of `data_iterator` + """ + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < tokens.size()[1] + ): + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask + # have size [batch size, seqlen] + tokens = tokens[:, :args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, :args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: + return loss, { + 'total loss': loss, + 'lm loss': averaged_loss[0], + 'moe loss': moe_loss, + 'mos loss': mos_loss + } + elif args.kd: + return loss, { + 'total loss': loss, + 'lm loss': averaged_loss[0], + 'moe loss': moe_loss, + 'kd loss': mos_loss + } + log.info( + f'>>> total loss: {loss}, ' + f'lm loss {averaged_loss[0]}, ' + f'kd loss {mos_loss}' + ) + else: + if max(args.num_experts) <= 1: + return loss, {'lm loss': averaged_loss[0]} + loss = loss + moe_loss + return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} + + +def calculate_mos_loss( + args, + stu_output, + teacher_model, + tokens, + position_ids, + attention_mask +): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if ( + 
args.curriculum_learning_legacy and + args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + csl = curriculum_seqlen + attention_mask = ( + attention_mask[:, :, :csl, :csl].contiguous() + ) + # No need to truncate labels + # as we do not need it for the teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, + position_ids, + attention_mask + ) + assert stu_output.size() == tea_output.size(), ( + 'teacher and student output should match in size. ' + f'Student: {stu_output.size()}, ' + f'Teacher: {tea_output.size()}, ' + f'CL seq length {args.curriculum_seqlen}' + ) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + # The target logits is expected to be probabilities. + # If we use log_softmax, + # then we need to set target_log to true + # when initializing the KLDivLoss. + tea_logits = F.softmax(tea_output / kd_temp, dim=2) + + mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')( + student_logits, + tea_logits + ) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
+ timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator + ) + timers('batch-generator').stop() + + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if ( + hasattr( + args, + 'data_efficiency_curriculum_learning_seqlen_type') + and ( + args.data_efficiency_curriculum_learning_seqlen_type + == 'seqlen_reshape' + ) + ): + args.data_efficiency_curriculum_learning_numel = ( + torch.numel(tokens) + ) + + if args.mos or args.kd: + # The forward func can return either the loss or the logits, + # depending on whether passing in the labels or not. + stu_output, other_losses = model(tokens, position_ids, attention_mask) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + labels = labels[:, :args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), + labels + ) + else: + output_tensor, other_losses = model( + tokens, + position_ids, + attention_mask, + labels=labels + ) + if ( + args.curriculum_learning_legacy and + args.curriculum_seqlen < args.seq_length + ): + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask + ) + + # Output_tensor stores the standard loss, + # loss_func calculates the total loss. 
+ return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + log.info( + '> building train, validation, and test datasets for GPT ...' + ) + files = [] + if args.data_file_list is not None: + log.info(f"Reading datasets from {args.data_file_list}") + with open(args.data_file_list, 'r') as flist: + for f in flist.readlines(): + w, fname = f.split() + files.append(float(w)) + files.append(fname) + elif len(args.data_path) == 1 and os.path.isdir(args.data_path[0]): + path = args.data_path[0] + "/" + for f in os.listdir(path): + if (os.path.isfile(path + f) and f.find(".bin") != -1): + files.append(1) + files.append(path + f.split(".bin")[0]) + else: + files = args.data_path + + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=files, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=True, + # skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path, + ) + log.info("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen( + f'type {cmd}', + stdout=subprocess.PIPE, + shell=True + ) + return result.wait() == 0 + + +def git_ds_info(): + if RANK != 0: + return + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, 
shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print( + f'**** Git info for Megatron: ' + f'git_hash={git_hash} git_branch={git_branch} ****' + ) + + +def main(): + if os.getenv('TORCH_PROFILER_ENABLED') == '1': + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + args = get_args() + assert args is not None + trace_output = Path(f"{args.tensorboard_dir}").joinpath( + f"torch-trace-{RANK}-of-{WORLD_SIZE}.json" + ) + prof.export_chrome_trace(trace_output.as_posix()) + log.info( + f'Saved trace output to: {trace_output.as_posix()}' + ) + else: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + # try: + # from megatron.text_generation import generate_and_post_process + # with torch.autocast(device_type=DEVICE, dtype=args.dtype): + # response, _, _, _ = generate_and_post_process( + # model, + # prompts=[ + # "Hello world", + # "Nature is", + # "Turing test comprises", + # "Explain solar eclipse" + # ], + # tokens_to_generate=32 + # ) + # if RANK == 0: + # log.info(f'generation completed..\n response:{response}') + # except ValueError as ve: + # log.critical(f'ValueError: {ve}') + # pass + # dist.barrier() + # model.train() + return model + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # 
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit(0) diff --git a/pretrain_llama.py b/pretrain_llama.py new file mode 100644 index 0000000000..ab7ffc785c --- /dev/null +++ b/pretrain_llama.py @@ -0,0 +1,589 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain GPT""" + +import os +import torch +import math + +# import logging + +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from rich import print +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import ( + average_losses_across_data_parallel_group, + update_rotary_pos_emb, +) +from megatron.arguments import core_transformer_config_from_args + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +from torch import nn +import torch.nn.functional as F + +# from ezpz import get_logger +from ezpz.dist import setup_torch, get_world_size, setup_wandb + +RANK = setup_torch( + backend="deepspeed", + port="5432", +) +WORLD_SIZE = get_world_size() +LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" + +WANDB_MODE = os.environ.get("WANDB_MODE", None) +DISABLE_WANDB = ( + WANDB_MODE is not None and str(WANDB_MODE).lower() == "disabled" +) + +if RANK == 0 and not DISABLE_WANDB: + # args = get_args() + # assert args is not 
None + # tensorboard_dir = args.tensorboard_dir + # if args.tensorboard_dir is not None: + # print(f'Setting (in env): {TENSORBOARD_DIR=}') + # os.environ['TENSORBOARD_DIR'] = args.tensorboard_dir + project_name = os.environ.get( + "WB_PROJECT", + os.environ.get("WANDB_PROJECT", "GenSLM-Megatron-DS"), + ) + print("--------------------------------------------------") + print(f"Setting up W&B from: {RANK} with {project_name}") + print("--------------------------------------------------") + setup_wandb(project_name=project_name) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + print_rank_0("building GPT model ...") + see_memory_usage("Before Building Model", force=True) + args = get_args() + assert args is not None + config = core_transformer_config_from_args(args) + # args = get_args() + # timers = get_timers() + if wandb.run is not None: + print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({"args": vars(args)}) + if RANK == 0: + git_ds_info() + + with deepspeed.zero.Init( + sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), + remote_device=( + None if args.remote_device == "none" + else args.remote_device, + ), + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu, + ): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from + # within training.py We need to call model.set_batch_fn after + # deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # Predompute the attention mask and store it in args. This avoids + # having to pipeline it as an activation during training. The mask + # is constant, and thus we can reuse it. 
+ attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=get_accelerator().current_device_name(), + ) + ).view(1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, cache rotary + # embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + # print_rank_0('\n ------------------------ ') + # print_rank_0(f'num of parameters {num_params}') + # print_rank_0('------------------------\n ') + print_rank_0(80 * "-") + print_rank_0(f"Number of parameters in model: {num_params}") + print_rank_0(80 * "-") + see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + wandb.run.watch( + model, + log="all", + log_graph=True, + ) + wandb.run.config.update({"num_params": num_params}) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + assert args is not None + assert tokenizer is not None + # Items and their type. + keys = ["text"] + datatype = torch.int64 + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + # Get the masks and postition ids. 
+ skip_mask = ( + hasattr(args, "use_flash_attn") + or hasattr(args, "flash_attn_triton") + ) + # skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask, + ) + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + assert args is not None + if args.data_efficiency_curriculum_learning: + if ( + "seqlen_truncate" in data_sampler_state_dict[ + "current_difficulties" + ] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_truncate" + ) + current_seqlen = ( + data_sampler_state_dict["current_difficulties"][ + "seqlen_truncate" + ] + ) + if current_seqlen < args.seq_length: + data["text"] = ( + data["text"][:, : (current_seqlen + 1)].contiguous() + ) + elif ( + "seqlen_reshape" in data_sampler_state_dict["current_difficulties"] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_reshape" + ) + 
current_seqlen = ( + data_sampler_state_dict["current_difficulties"][ + "seqlen_reshape" + ] + ) + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data["text"]) + reshape_len = ( + (data["text"].size()[1] // (current_seqlen + 1)) + * (current_seqlen + 1) + ) + data["text"] = torch.cat( + ( + data["text"][:, :reshape_len] + .contiguous() + .view(-1, current_seqlen + 1), + data["text"][:, -(current_seqlen + 1):], + ), + 0, + ).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen + 1)) + num_row = min(num_row, data["text"].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data["text"] = data["text"][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of + `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + assert tokenizer is not None and args is not None + + # Items and their type. + keys = ["text"] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < tokens.size()[1] + ): + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size: + # [batch size, seqlen] + tokens = tokens[:, : args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, : args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, : args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + # Reduce loss for logging. 
+ averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: # type:ignore + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "mos loss": mos_loss, + # } + losses = { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "mos loss": mos_loss, + } + elif args.kd: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "kd loss": mos_loss, + # } + losses = { + "total-loss": loss, + "lm-loss": averaged_loss[0], + "moe-loss": moe_loss, + "kd-loss": mos_loss, + } + print_rank_0( + ">>> total loss: {}, lm loss {}, kd loss {}".format( + loss, averaged_loss[0], mos_loss + ) + ) + else: + if max(args.num_experts) <= 1: # type:ignore + losses = {"lm-loss": averaged_loss[0]} + # return loss, {"lm loss": averaged_loss[0]} + else: + loss = loss + moe_loss + losses = {"lm-loss": averaged_loss[0], "moe loss": moe_loss} + # return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + if wandb is not None and wandb.run is not None: + # wandb.run.log({}) + losses |= {'loss': loss} + wandb.run.log({f"Loss/{k}": v for k, v in losses.items()}) + return loss, losses + + +def calculate_mos_loss( + args, + stu_output, + teacher_model, + tokens, + position_ids, + attention_mask, +): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + attention_mask = attention_mask[ + :, :, :curriculum_seqlen, :curriculum_seqlen + 
].contiguous() + # No need to truncate labels as we do not need it for the + # teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, position_ids, attention_mask + ) + assert ( + stu_output.size() == tea_output.size() + ), ( + "teacher and student output should match in size. " + f"Student: {stu_output.size()}, " + f"Teacher: {tea_output.size()}, " + f"CL seq length {args.curriculum_seqlen}" + ) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + tea_logits = F.softmax( + tea_output / kd_temp, dim=2 + ) + # The target logits is expected to be probabilities. If we use + # log_softmax, then we need to set target_log to true when initializing + # the KLDivLoss. + mos_loss = ( + kd_temp + * kd_temp + * nn.KLDivLoss(reduction="batchmean")(student_logits, tea_logits) + ) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + assert timers is not None and args is not None + + # Get the batch. + timers("batch-generator", log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = ( + get_batch(data_iterator) + ) + timers("batch-generator").stop() + + if args.data_efficiency_curriculum_learning: # type: ignore + args.curriculum_seqlen = tokens.size()[1] # type: ignore + if ( + hasattr( + args, + "data_efficiency_curriculum_learning_seqlen_type" + ) + and ( + args.data_efficiency_curriculum_learning_seqlen_type + == "seqlen_reshape" + ) + ): + args.data_efficiency_curriculum_learning_numel = ( + torch.numel(tokens) + ) + + assert args is not None + if args.mos or args.kd: # type:ignore + # The forward func can return either the loss or the logits, depending + # on whether passing in the labels or not. 
+ stu_output, other_losses = model(tokens, position_ids, attention_mask) + if ( + args.curriculum_learning_legacy # type:ignore + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + labels = labels[:, : args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), labels + ) + else: + output_tensor, other_losses = model( + tokens, position_ids, attention_mask, labels=labels + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask, + ) + + # Output_tensor stores the standard loss, loos_func calculates the total + # loss. + return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + assert args is not None + print_rank_0( + "> building train, validation, and test datasets " "for GPT ..." 
+ ) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path, + ) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen( + f"type {cmd}", + stdout=subprocess.PIPE, + shell=True + ) + return result.wait() == 0 + + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists("git"): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode("utf-8").strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode("utf-8").strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print( + f"**** Git info for Megatron: " + f"git_hash={git_hash} git_branch={git_branch} ****" + ) + + +def main(): + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, + data_post_process=data_post_process, + ) + return model + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + 
+ model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit() diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh new file mode 100644 index 0000000000..9dc0f1d946 --- /dev/null +++ b/train_aGPT_7B.sh @@ -0,0 +1,30 @@ +#!/bin/bash --login + + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit + +HOSTNAME=$(hostname) +if [[ "${HOSTNAME}" == x3* ]]; then + MACHINE="polaris" +elif [[ "${HOSTNAME}" == x1* ]]; then + MACHINE="sunspot" +elif [[ "${HOSTNAME}" == x4* ]]; then + MACHINE="aurora" +fi + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" && mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" + +echo "+---------------------------------------------------------+" +echo "| Running on: ${MACHINE}" +echo "| Detected ${nhosts} hosts. Running with micro batch: ${MBS}" +echo "| Logging job output to: ${OUTFILE}" +echo "+---------------------------------------------------------+" + +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 +echo "${OUTFILE}" >> "${OUTDIR}/latest" +# export DEBUG=1 +# export MICRO_BATCH="${MICRO_BATCH:-${MBS}}" +export DATA_FILE_LIST="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt}" +bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_agpt.sh b/train_agpt.sh new file mode 100644 index 0000000000..a21b988073 --- /dev/null +++ b/train_agpt.sh @@ -0,0 +1,12 @@ +#!/bin/bash --login + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" +mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +echo "${OUTFILE}" >> "${OUTDIR}/latest" +echo "Logging job output to: ${OUTFILE}" +# export DEBUG=1 +bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_agpt_polaris_7B_production.sh b/train_agpt_polaris_7B_production.sh new file mode 100644 index 0000000000..f83b6ebc29 --- 
/dev/null +++ b/train_agpt_polaris_7B_production.sh @@ -0,0 +1,29 @@ +#!/bin/bash --login +# +# This script can be submitted with `qsub` via: +# +# ```bash +# $ git clone https://github.com/argonee-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ qsub train_agpt_polaris_7B_production.sh +# ``` + +cd "${PBS_O_WORKDIR}" || exit + +TODAY="$(date "+%Y-%m-%d")" +NOW="$(date "+%Y-%m-%d-%H%M%S")" +OUTDIR="${PBS_O_WORKDIR}/pbslogs/${TODAY}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +mkdir -p $(dirname "${OUTFILE}") + +echo "${OUTFILE}" >> "$(dirname ${OUTDIR})/latest" +echo "Logging job output to: ${OUTFILE}" + +# export DEBUG=1 +# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 + +# Path to the data file list: +DFL="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" + +# Launch: +MICRO_BATCH=2 DATA_FILE_LIST="${DFL}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh new file mode 100644 index 0000000000..bf346e2144 --- /dev/null +++ b/train_llama_alcf.sh @@ -0,0 +1,161 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home + + +#### Make it easy to track experiments by date ################### +year="$(date "+%Y")" +month="$(date "+%m")" +day="$(date "+%Y-%m-%d")" +today="$(date "+%Y-%m-%d")" # kept for backwards compatibility +started_at="$(date "+%Y-%m-%d-%H%M%S")" +export YEAR="${year}" +export MONTH="${month}" +export DAY="${day}" +export TODAY="${today}" +export STARTED_AT="${started_at}" +################################################################## + + +############################################################################# +# Check if running in `DEBUG=1` mode. +# - If so, this will print each command before it is ran and exit if any of +# them return a nonzero exit status. 
+############################################################################# +if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh` + printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" + set -euxo pipefail +fi + +if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh` + echo "Run NOOP mode" + set -o noexec # same as set -n +fi + +################################################## +# Helper function for `source`-ing another file +################################################## +sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +############################################################################## +###################### MAIN LOGIC ############################################ +# ----[0. Navigate into `$PBS_O_WORKDIR`]-------------------------------------- +cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE + +# ----[1. Assert `./pretrain_gpt_alcf.py` exists:]----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit + +# ----[1.5 Keep track of ] +exec_stem=$(echo "${EXEC}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.py//g") +export EXEC_STEM="${exec_stem}" + + +# ----[2. `source ./ALCF/helpers_alcf.sh`:]------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit + +# ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ +get_machine || exit # 01. Identify machine we're on +setEnv || exit # 02. Load `conda` environment +# saveDSenv || exit # 03. Save env vars to `.deepspeed_env` +ezpz || exit # 04. Determine WORLD_SIZE, etc. from `PBS_*` vars +setParams || exit # 05. Set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 06. Create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 07. 
Specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 08. Specify additional `deepspeed` arguments +dfl="${DATA_FILE_LIST:-}" # 09. Setup data + tokenizer +tok="${TOKENIZER_TYPE:-Llama2}" # via `DATA_FILE_LIST` and `TOKENIZER_TYPE` +setup_tokenizer_and_data "${tok}" "${dfl}" || exit +printJobInfo || exit # 10. Print job info +setupLauncher || exit # 11. set launcher to one of `MPICH` (default), or `deepspeed` +save_dotenv "${CKPT_DIR}" || exit # 12. Print info about loaded modules and runtime environment +check_and_kill_if_running || exit # 13. Check that were not already running, if so, exit. +# ----------------------------------------------------------------------------- +############################################################################## + +################################################ +# Assert `$TBDIR` exists inside our `$CKPT_DIR` +# to ensure metrics are tied to checkpoint +################################################ +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + +data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" +echo "Using data_cache_path: ${data_cache_path}" + + +export DEFAULTS="\ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard" + +# Take custom args +custom_args=" $@" + + # --log-num-zeros-in-grad \ + # --log-memory-to-tensorboard \ + # --data-file-list ${DATA_FILE_LIST} \ + # --data-file-list ${DATA_FILE_LIST} \ + # --tokenizer-type Llama2Tokenizer \ + # --tokenizer-model ${TOKENIZER_MODEL} \ +run_cmd=" + ${LAUNCHER} \ + --${DTYPE} \ + ${DEFAULTS} \ + --optimizer ${OPT} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + 
--train-iters ${TRAIN_ITER} \ + --tensorboard-dir ${TBDIR} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --data-cache-path ${data_cache_path} \ + ${DATA_FLAGS} \ + ${LR_ARGS} \ + ${LLAMA_ARGS} \ + ${TIMING_STR} \ + ${TOKENIZER_FLAGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + +echo "${run_cmd}" +printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow ${OUTPUT_LOG})" +eval "${run_cmd}" +set +x diff --git a/train_llama_alcf_aurora_qsub.sh b/train_llama_alcf_aurora_qsub.sh new file mode 100755 index 0000000000..6f247da9c8 --- /dev/null +++ b/train_llama_alcf_aurora_qsub.sh @@ -0,0 +1,7 @@ +#!/bin/bash --login + + +cd "${PBS_O_WORKDIR}" || exit +eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 +source /home/foremans/anl_24_release_q4/llm.devkit/setenv.sh +bash ./train_llama_alcf_aurora.sh diff --git a/train_llama_alcf_polaris_hzheng.sh b/train_llama_alcf_polaris_hzheng.sh new file mode 100755 index 0000000000..83d8a2c5a7 --- /dev/null +++ b/train_llama_alcf_polaris_hzheng.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug-scaling +#PBS -l select=2 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh + +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) +export TP=1 +export PP=1 +export MBS=1 +export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) +export SP=$((PBS_JOBSIZE*PPN/PP/TP)) +export DATE_TAG=$(date 
+"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" +echo "BS: $BS - PP:$PP - TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +# First time running, it will compile the fused kernels, which will take about 10 mins +# >>> done with compiling and loading fused kernels. Compilation time: 545.468 seconds + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +#LAUNCHER="//eagle/argonne_tpc/soft/Megatron-DeepSpeed/..//conda/2024-03-11/lib/python3.10/site-packages/deepspeed/launcher/launcher_helper.py --launcher mpich " +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 /eagle/argonne_tpc/soft/Megatron-DeepSpeed/local_rank.sh python3 $LAUNCHER ./pretrain_gpt_alcf.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 10 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu 
--normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ${MD}/outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed \ + --data-cache-path ./data_cache_path/ diff --git a/train_llama_nersc_perlmutter.sh b/train_llama_nersc_perlmutter.sh new file mode 100644 index 0000000000..8131579809 --- /dev/null +++ b/train_llama_nersc_perlmutter.sh @@ -0,0 +1,141 @@ +#!/bin/bash --login +#SBATCH -A m4388_g +#SBATCH -C 'gpu&hbm80g' +#SBATCH -q regular +#SBATCH -t 00:30:00 +#SBATCH --nodes 128 +#SBATCH --gpus 512 +# + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- +# cd "${PBS_O_WORKDIR}" || exit +cd "${SLURM_SUBMIT_DIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE +# dflfb="${HERE}/genslm-subsample.txt" +# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. 
set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST:-${dflfb}}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. print job info +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# Take custom args +custom_args=" $@" + +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + +# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit +# echo "Using $(which python3)" +# --launcher_args='--pmi=pmix' + # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + # ${launch_cmd} \ + # --optimizer adam \ + # --use-flash-attn-v2 \ + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ +# source ezpz/src/ezpz/bin/getjobenv || exit +# if [[ -z "${DIST_LAUNCH}" ]]; then +# setupSrun || exit +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# else +# SRUN_EXEC="${DIST_LAUNCH}" +# fi +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# +export NHOSTS="${SLURM_NNODES:-1}" +export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" +export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" +export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + + # srun --gpus ${NGPUS} \ + # --gpus-per-node ${NGPU_PER_HOST} \ + # -N ${NHOSTS} \ + # -n ${NGPUS} \ + # -l -u --verbose python3 ${EXEC} \ +run_cmd=" + ${SRUN_EXEC} python3 ${EXEC} \ + --$DTYPE \ + --optimizer ${OPT} \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + 
--no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + +run_cmd=$(echo "${run_cmd}" | sed -e 's/ */ /g') + +# echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "! Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +# echo "${OUTPUT_LOG}" +eval "${run_cmd}" +set +x