From da3d012b66796ed59ce3d5f1240e53856ca56a42 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 22 Aug 2023 08:24:43 -0500 Subject: [PATCH 001/268] Adds `ALCF/*` --- ALCF/args.sh | 647 +++++++++++++++++++++++++++++ ALCF/benchmark_train.sh | 82 ++++ ALCF/launch.sh | 151 +++++++ ALCF/model.sh | 264 ++++++++++++ ALCF/mpi_wrapper | 27 ++ ALCF/setup.sh | 228 ++++++++++ ALCF/submit-pbs.sh | 263 ++++++++++++ ALCF/submit.sh | 66 +++ ALCF/sweep_args_theta_multinode.sh | 164 ++++++++ ALCF/train-gpt3.sh | 71 ++++ 10 files changed, 1963 insertions(+) create mode 100755 ALCF/args.sh create mode 100755 ALCF/benchmark_train.sh create mode 100755 ALCF/launch.sh create mode 100755 ALCF/model.sh create mode 100755 ALCF/mpi_wrapper create mode 100755 ALCF/setup.sh create mode 100755 ALCF/submit-pbs.sh create mode 100755 ALCF/submit.sh create mode 100755 ALCF/sweep_args_theta_multinode.sh create mode 100755 ALCF/train-gpt3.sh diff --git a/ALCF/args.sh b/ALCF/args.sh new file mode 100755 index 0000000000..f452d3b7aa --- /dev/null +++ b/ALCF/args.sh @@ -0,0 +1,647 @@ +#!/bin/bash -login + +# SCRIPT_PATH="${BASH_SOURCE[0]}" +# while [ -L "$SCRIPT_PATH" ]; do +# SCRIPT_DIR="$(cd -P "$(dirname "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" +# SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" +# [[ ${SCRIPT_PATH} != /* ]] && SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_PATH}" +# done +# SCRIPT_PATH="$(readlink -f "$SCRIPT_PATH")" +# SCRIPT_DIR="$(cd -P "$(dirname -- "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" +# +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +function FindMegatron() { + MEGATRON_INSTALL=$(python3 
-c 'import megatron; print(megatron.__file__)' | tail -1) + MEGATRON_DIR=$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1))) +} + +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +ALCF_DIR=$(find "${HERE}" -name "ALCF") + +# [ "${MEGATRON_DIR}" ] && echo "Caught ${MEGATRON_DIR} from env" || FindMegatron +# ALCF_DIR="${MEGATRON_DIR}/ALCF" +# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +USER=$(whoami) + +# # DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) +# PARENT=$(dirname "$DIR") +# +# HERE=$(python3 -c 'import os; print(os.getcwd())') + +# echo "------------------------" +# echo "SCRIPT_DIR=$SCRIPT_DIR" +# echo "SCRIPT_PATH=$SCRIPT_PATH" +# echo "------------------------" +# echo "SOURCE=$SOURCE" +# echo "DIR=$DIR" +# echo "PARENT: ${PARENT}" +# echo "HERE: ${HERE}" +# echo "------------------------" + +if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + HOSTFILE="${COBALT_NODEFILE}" +elif [[ $(hostname) == x* ]]; then + echo "Setting up Polaris from $(hostname)" + HOSTFILE="${PBS_NODEFILE}" +else + echo "Unexpected hostname $(hostname)" +fi + +NHOSTS=$(wc -l < "${HOSTFILE}") +NGPU_PER_HOST=$(nvidia-smi -L | wc -l) +NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" +WORLD_SIZE="${NGPUS}" +PARALLEL_SIZE="${WORLD_SIZE}" +# # NGPUS="$((${NHOSTS}*${NGPU_PER_HOST}))" +echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" + + +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-GPT13B}" +echo 
"==========================+" +echo "Using ${MODEL_SIZE_KEY}" +echo "==========================+" + +sourceFile "${ALCF_DIR}/model.sh" + +MODEL_TYPE=${MODEL_TYPE:-gpt} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model Parallel / Pipeline Parallel ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# ---------- +# Originals +# MPSIZE=8 +# PPSIZE=16 +# ---------- +# NHOSTS=$(wc -l < "${PBS_NODEFILE}") +export DDP_IMPL="local" # FSDP | local | torch +export USE_FLASH_ATTN=${USE_FLASH_ATTN:-0} # 1 | 0 +export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +export SEQ_LEN=${SEQ_LEN:-2048} +export PPSIZE=${PPSIZE:-1} +# export MPSIZE=${MPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export NHOSTS="$NHOSTS" +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 + +# NHOSTS=$(wc -l < "${COBALT_NODEFILE}") +# # NGPU_PER_HOST=8 +# NGPU_PER_HOST=$(nvidia-smi -L | wc -l) +# PARALLEL_SIZE=$((${NHOSTS}*${NGPU_PER_HOST})) + +export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt +export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron + + +# Deal with Sequence Parallel implementation --------------------------------------- +# ---------------------------------------------------------------------------------- +if [[ ${SP_TYPE} == "ds" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. || + # -------------------------------------------------------------------------------- + USE_SEQUENCE_PARALLEL=0 + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. 
Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + # if [[ "$MPSIZE" != 0 ]]; then + # SPSIZE=$(( WORLD_SIZE - MPSIZE )) + # echo "############################################################" + # echo "Caught MPSIZE: $MPSIZE from env!" + # echo "Setting SPSIZE: (${WORLD_SIZE} - ${MPSIZE}) = ${SPSIZE}" + # echo "############################################################" + # else + # MPSIZE=1 + # SPSIZE="$WORLD_SIZE" + # echo "############################################################" + # echo "Setting MPSIZE: $SPSIZE, SPSIZE: $WORLD_SIZE = $SPSIZE" + # echo "############################################################" + # fi + # [ "$MPSIZE" ] && SPSIZE=1 || SPSIZE="${WORLD_SIZE}" + # [ "$SPSIZE" ] && MPSIZE=1 || MPSIZE="${}" + # [ "$MPSIZE" = "$WORLD_SIZE" ] && SPSIZE=1 || SPSIZE="$WORLD_SIZE" + # [ "$SPSIZE" = "$WORLD_SIZE" ] && MPSIZE=1 || MPSIZE="${WORLD_SIZE}" + # export SPSIZE="${WORLD_SIZE}" + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export ZERO_STAGE="${ZERO_STAGE}" +elif [[ ${SP_TYPE} == "megatron" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. 
|| implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + # export SPSIZE=1 + # export MPSIZE="${WORLD_SIZE}" + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + # [ "$USE_SEQUENCE_PARALLEL" = 0 ] && export USE_SEQUENCE_PARALLEL=0 || export USE_SEQUENCE_PARALLEL=1 + # if [[ "$SPSIZE" == 0 ]]; then + # echo "Caught SPSIZE=$SPSIZE from env!!" + # USE_SEQUENCE_PARALLEL=0 + # else + # USE_SEQUENCE_PARALLEL=1 + # fi + # if [[ "$USE_SEQUENCE_PARALLEL" ]]; then + # echo "Caught USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL} from env!!" + # [ "${SPSIZE}" != 0 ] && USE_SEQUENCE_PARALLEL=1 || USE_SEQUENCE_PARALLEL=0 + # [ "$USE_SEQUENCE_PARALLEL" = 0 ] && echo "Not using sequence parallelism" || USE_SEQUENCE_PARALLEL=1 + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" + # if [[ "${SPSIZE}" == 0 ]]; then + # [ "$SPSIZE" ] && USE_SEQUENCE_PARALLEL=1 || USE_SEQUENCE_PARALLEL=0 + # echo "Caught SPSIZE=${SPSIZE} from env, with ${SP_TYPE} sequence parallelism" + # export SPSIZE="${SPSIZE}" + # export USE_SEQUENCE_PARALLEL=0 + # else + # export SPSIZE=1 + # export USE_SEQUENCE_PARALLEL=1 + # fi + # if [ -z "${ZERO_STAGE}" ]; then + # echo "ZERO_STAGE not set, setting to 0 for ${SP_TYPE}" + # ZERO_STAGE=0 + # else + # echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + # fi +else + echo "Unexpected SP_TYPE: ${SP_TYPE}" + exit 1 +fi +# ------------------------------------------------------------------------ + +echo "####################################################" +echo "# USING: ${SP_TYPE}" +echo "# SPSIZE: ${SPSIZE}" +echo "# PPSIZE: ${PPSIZE}" +echo "# MPSIZE: ${MPSIZE}" +echo "# ZERO_STAGE: ${ZERO_STAGE}" +echo "# 
WORLD_SIZE: ${WORLD_SIZE}" +echo "# USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "####################################################" + +echo "########################################################" +echo "| ${SP_TYPE} sequence parallelism, with: " +echo "| {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" +echo "########################################################" + +# GLOBAL_BATCH=1 +# GLOBAL_BATCH=$(( $GLOBAL_BATCH / $MPSIZE / $PPSIZE / $SPSIZE )) +# GLOBAL_BATCH=$(( $NGPUS * $MICRO_BATCH * $GRADIENT_ACCUMULATION_STEPS )) +GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GRADIENT_ACCUMULATION_STEPS )) +echo "GB = NGPUS * MB * GAS = ${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS} = ${GLOBAL_BATCH}" + +GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) +echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" +export GLOBAL_BATCH="$GLOBAL_BATCH" + +echo "--------------------------------" +echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "--------------------------------" + +# ┏━━━━━━━━━━━━┓ +# ┃ Data paths ┃ +# ┗━━━━━━━━━━━━┛ +# DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document +DATA_DIR="${PARENT}/dataset" +DATA_PATH="${DATA_DIR}/BookCorpusDataset_text_document" +VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" + +# DATA_PATH="/home/czh5/genome/Megatron-DeepSpeed/dataset/BookCorpusDataset_text_document" +# VOCAB_FILE="/home/czh5/genome/Megatron-DeepSpeed/dataset/gpt2-vocab.json" +# MERGE_FILE="/home/czh5/genome/Megatron-DeepSpeed/dataset/gpt2-merges.txt" +# DATA_PATH="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/BookCorpusDataset_text_document" +# VOCAB_FILE="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/gpt2-vocab.json" +# 
MERGE_FILE="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/gpt2-merges.txt" + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ FILE I/O SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}" +RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}" +RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" +RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" +RUN_STR="${MODEL_SIZE}_${RUN_STR}" + +if [[ $USE_FLASH_ATTN == 1 ]] ; then + RUN_STR="flashAttn_${RUN_STR}" +fi +if [[ $DDP_IMPL == 'FSDP' ]]; then + RUN_STR="FSDP_${RUN_STR}" +fi +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]] ;then + RUN_STR="actCkpt_${RUN_STR}" +fi +if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then + RUN_STR="SP_${RUN_STR}" +fi + +RUN_STR="${MODEL_TYPE}_${RUN_STR}" + +OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" +CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" +TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" + +export MODEL_SIZE="$MODEL_SIZE" +export TENSORBOARD_DIR=$TENSORBOARD_DIR +export OUTPUT_DIR=$OUTPUT_DIR +mkdir -p "$OUTPUT_DIR/tensorboard/wandb" +mkdir -p "$CHECKPOINT_DIR" +mkdir -p "$TENSORBOARD_DIR" +mkdir -p "$OUTPUT_DIR" +echo "OUTPUT TO: ${OUTPUT_DIR}" + +# if [[ -z "${NVME_PATH}" ]]; then +# echo "NVME_PATH: $NVME_PATH" +# else +# if [[ $(hostname) == x* ]]; then +# export NVME_PATH="/local/scratch/" +# elif [[ $(hostname) == theta* ]]; then +# export NVME_PATH="/raid/scratch/" +# else +# export NVME_PATH="/tmp/" +# fi +# fi + +# echo "NVME_PATH: ${NVME_PATH}" + +if [[ $MODEL_TYPE == "gpt" ]] ; then + DATA_LOAD_ARGS="--data-path $DATA_PATH --vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" +else + DATA_LOAD_ARGS="" +fi + +# Set to cpu for offloading to cpu for larger models +OFFLOAD_DEVICE="${OFFLOAD_DEVICE:-cpu}" +CPU_OPTIM=" --cpu-optimizer" + +# # Set to none and empty string for no cpu offloading +# OFFLOAD_DEVICE="none" +# CPU_OPTIM=" " + +# ┏━━━━━━━━━━━━━━━━━━┓ +# ┃ DeepSpeed Config ┃ +# ┗━━━━━━━━━━━━━━━━━━┛ +DS_CONFIG=${PARENT}/ds_config-gpt.json +echo 
"!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" +echo "! DS_CONFIG: ${DS_CONFIG}" +echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" +# "optimizer": { +# "type": "Adam", +# "params": { +# "lr": 0.001, +# "betas": [0.8, 0.999], +# "eps": 1e-8, +# "weight_decay": 3e-7 +# } +# }, + +# "zero_allow_untested_optimizer": false, +# "train_batch_size" : $GLOBAL_BATCH, +# "zero_force_ds_cpu_optimizer": false, +if [[ $ZERO_STAGE == "3" ]] ; then # stage 3: write DS_CONFIG with param + optimizer offload; the unquoted EOT heredoc expands $MICRO_BATCH etc. +cat <<EOT > "$DS_CONFIG" +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "stage3_max_live_parameters": 3e9, + "stage3_max_reuse_distance": 3e9, + "stage3_param_persistence_threshold": 1e5, + "stage3_prefetch_bucket_size": 1e9, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_bucket_size": 90000000, + "sub_group_size": 5e7, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "offload_optimizer": { + "device": "cpu", + "buffer_count": 4, + "pipeline_read": false, + "pipeline_write": false, + "pin_memory": true + } + }, + "fp16": { + "enabled": true, + "initial_scale_power" : 12, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "aio": { + "block_size": 1048576, + "queue_depth": 16, + "single_submit": false, + "overlap_events": true, + "thread_count": 2 + }, + "flops_profiler": { + "enabled": true, + "profile_step": 1, + "module_depth": -1, + "top_modules": 3, + "detailed": true, + "output_file": null + }, + "comms_logger": { + "enabled": true, + "verbose": false, + "prof_all": false, + "debug": false + }, + "wandb": { + "enabled": true, + "project": "Megatron-DS-Benchmarking" + } +} +EOT +else # any other stage: write DS_CONFIG with "stage": $ZERO_STAGE +cat <<EOT > "$DS_CONFIG" +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + 
"zero_force_ds_cpu_optimizer": false, + "zero_optimization": { + "stage": $ZERO_STAGE, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true, + "offload_optimizer": { + "device": "cpu" + } + }, + "optimizer": { + "type": "OneBitAdam" + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "fp16": { + "enabled": true, + "initial_scale_power": 12 + }, + "flops_profiler": { + "enabled": true, + "profile_step": 1, + "module_depth": -1, + "top_modules": 3, + "detailed": true, + "output_file": null + }, + "comms_logger": { + "enabled": true, + "verbose": false, + "prof_all": false, + "debug": false + }, + "wandb": { + "enabled": true, + "project": "Megatron-DS-Benchmarking" + } +} +EOT +fi +# "optimizer": { +# "type": "Adam", +# "params": { +# "lr": 0.001, +# "betas": [0.8, 0.999], +# "eps": 1e-8, +# "weight_decay": 3e-7 +# } +# }, +# +# "offload_optimizer": { +# "device": "$OFFLOAD_DEVICE", +# "buffer_count": 4, +# "pipeline_read": false, +# "pipeline_write": false, +# "pin_memory": true +# } +# "train_batch_size" : $GLOBAL_BATCH, +# 'offload_optimizer': 'cpu' + # "train_batch_size" : $GLOBAL_BATCH, +# "offload_optimizer": { +# "device": "cpu", +# "nvme_path": "/raid/scratch/" +# } +# +# "optimizer": { +# "type": "AdamW", +# "params": { +# "lr": 0.001, +# "betas": [0.8, 0.999], +# "eps": 1e-8, +# "weight_decay": 3e-7 +# } +# }, +# "optimizer": { +# "type": "OneBitAdam", +# "params": { +# "lr": 0.001, +# "betas": [ +# 0.8, +# 0.999 +# ], +# "eps": 1e-8, +# "weight_decay": 3e-7, +# "freeze_step": 400, +# "cuda_aware": false, +# "comm_backend_name": "nccl" +# } +# }, +# +# "optimizer": "Adam", +# "optimizer": { +# "type": "OneBitAdam", +# "params": { +# "lr": 0.001, +# "betas": [ +# 0.8, +# 0.999 +# ], +# "eps": 1e-8, +# "weight_decay": 3e-7, +# "freeze_step": 400, +# "cuda_aware": true, +# 
"comm_backend_name": "nccl" +# } +# }, +# +# +# 'deepspeed_mpi': True, +# 'ds_pipeline_enabled': False, +# 'rank': 0, +# 'world_size': 1, +# 'transformer_pipeline_model_parallel_size': 1, +# 'data_parallel_size': 1, +# 'virtual_pipeline_model_parallel_ size': None, + + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ DeepSpeed Arguments ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +if [[ "$DDP_IMPL" != "FSDP" ]] ; then + ds_args="" + ds_args=" --deepspeed ${ds_args}" + ds_args=" --deepspeed_mpi ${ds_args}" + ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + if [[ "$PPSIZE" == 1 ]]; then + ds_args="--no-pipeline-parallel ${ds_args}" + else + ds_args=" --pipeline-model-parallel-size ${PPSIZE} ${ds_args}" + fi + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + fi +fi + +# ┏━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ MEGATRON-LM SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━┛ +# "--sequence-parallel-size ${SPSIZE}" +gpt_args=( + "--seed ${RANDOM}" + "--DDP-impl ${DDP_IMPL}" + "--pipeline-model-parallel-size ${PPSIZE}" + "--tensor-model-parallel-size ${MPSIZE}" + "--num-layers ${NLAYERS}" + "--hidden-size ${HIDDEN}" + "--num-attention-heads ${ATEN_HEADS}" + "--micro-batch-size ${MICRO_BATCH}" + "--global-batch-size ${GLOBAL_BATCH}" + "--seq-length ${SEQ_LEN}" + "--max-position-embeddings ${SEQ_LEN}" + "--train-iters 10" + "--lr-decay-iters 320000" + "--num-workers 1" + "$DATA_LOAD_ARGS" + "--data-impl mmap" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr 0.00015" + "--lr-decay-style cosine" + "--min-lr 1.0e-5" + "--weight-decay 1e-2" + "--clip-grad 1.0" + "--lr-warmup-fraction .01" + "--log-interval 1" + "--save-interval 1000" + "--eval-interval 1000" + "--eval-iters 10" + "--override-opt_param-scheduler" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + "--tensorboard-log-interval 1" +) + +# --recompute-activations \ +# --recompute-granularity full \ +# --recompute-method 
uniform \ +# --recompute-num-layers 1 \ +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers 1" + ) +fi + +if [[ "$DDP_IMPL" != "FSDP" ]] ; then + gpt_args+=( + # "${gpt_args[*]}" + "--fp16" + ) +else + gpt_args+=( + "--bf16" + ) +fi + +if [[ "$USE_FLASH_ATTN" == 1 ]] ; then + gpt_args+=( + "--use-flash-attn" + ) +fi + +# if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then +# gpt_args+=( +# "--sequence-parallel" +# ) +# fi + +if [[ "${SP_TYPE}" == "ds" ]]; then + gpt_args+=( + "--cpu-optimizer" + ) +fi + +export gpt_args=( + "${gpt_args[*]}" + "${ds_args[*]}" +) +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "gpt_args: ${gpt_args[*]}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" diff --git a/ALCF/benchmark_train.sh b/ALCF/benchmark_train.sh new file mode 100755 index 0000000000..94eb94e116 --- /dev/null +++ b/ALCF/benchmark_train.sh @@ -0,0 +1,82 @@ +#!/bin/bash --login + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") +# DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" 
+ exit 1 +fi + + +SOURCE=${BASH_SOURCE[0]} +while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) + SOURCE=$(readlink "$SOURCE") + [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) + + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +SETUP_FILE="${DIR}/setup.sh" +MODEL_FILE="${DIR}/model.sh" +ARGS_FILE="${DIR}/args.sh" +LAUNCH_FILE="${DIR}/launch.sh" + + +sourceFile "${SETUP_FILE}" +sourceFile "${MODEL_FILE}" +sourceFile "${ARGS_FILE}" +sourceFile "${LAUNCH_FILE}" + + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +echo "My current script is: ${SCRIPT_DIR[0]}" + +NHOSTS=$(wc -l < "${COBALT_NODEFILE}") +# NGPU_PER_HOST=8 +NGPU_PER_HOST=$(nvidia-smi -L | wc -l) +PARALLEL_SIZE=$(( NHOSTS * NGPU_PER_HOST )) + +export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt +export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron + +echo "+-----------------------------+" +echo "| MODEL TYPE: ${MODEL_TYPE}" +echo "| SP TYPE: ${SP_TYPE}" +echo "+-----------------------------+" + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +export MODEL_TYPE=${MODEL_TYPE:-gpt} + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# fullNode "$@" +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +PID=$! 
+wait $PID diff --git a/ALCF/launch.sh b/ALCF/launch.sh new file mode 100755 index 0000000000..7604098667 --- /dev/null +++ b/ALCF/launch.sh @@ -0,0 +1,151 @@ +#!/bin/bash --login + +HOST=$(hostname) + +# # DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" +# +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +PARENT=$(dirname "${ALCF_DIR}") + +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source=./setup.sh + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# PARENT=$(dirname "${DIR}") + +MASTER_ADDR=$(uname -n) +MASTER_PORT=20010 +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +MPI_WRAPPER="${SCRIPT_DIR}/mpi_wrapper" + +sourceFile "${ALCF_DIR}/args.sh" + +MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" + +printJobInfo() { + echo "Job started at: ${TSTAMP} on $(hostname)" + echo "Job running in: ${DIR}" + echo "Training GPT-3 with ${MODEL_SIZE} parameters" + echo "Writing logs to: ${OUTPUT_DIR}" + echo 'to view output: tail -f $(tail -1 logfiles)' + echo "i.e. 
tail -f $(tail -1 "${PARENT}"/logfiles)" +} + +launchJob() { + echo "using: $(which python3)" | tee -a "${OUTPUT_LOG}" + printJobInfo | tee -a "${OUTPUT_LOG}" + echo EXEC="${EXEC}" | tee -a "${OUTPUT_LOG}" + echo "Writing logs to: ${OUTPUT_LOG}" | tee -a "${OUTPUT_LOG}" + ${EXEC} "$@" # >> "${OUTPUT_LOG}" 2>&1 & +} + +singleGPU() { + echo "\ + Running on 1 host \ + with 1 GPUs each \ + for a total of 1 GPUs" + EXEC="\ + $(which python3) \ + ${MAIN} \ + ${gpt_args} \ + ${ds_args}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts1-ngpu1-$TSTAMP.log" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & +} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Use all available GPUs a single nodes ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +fullNode() { + echo "fullNode started" + echo "MPI_COMMAND ${MPI_COMMAND}" + echo "MPI_DEFAULTS ${MPI_DEFAULTS}" + echo "NGPUS ${NGPUS}" + echo "hostfile ${DIR}/hostfile" + echo "MAIN ${MAIN}" + echo "gpt_args ${gpt_args}" + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + # NGPU_PER_HOST=1 + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + # hostname > $DIR/hostfile + echo "\ + Running on $NHOSTS hosts \ + with $NGPU_PER_HOST GPUs each \ + for a total of $NGPUS GPUs" + EXEC="\ + ${MPI_COMMAND} \ + ${MPI_DEFAULTS} \ + "${MPI_ELASTIC}" + ${MPI_WRAPPER} ${MASTER_ADDR} ${MASTER_PORT} \ + ${MAIN} \ + ${gpt_args} \ + ${ds_args}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}" +} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Use all available GPUs on all available nodes ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +elasticDistributed() { + NHOSTS=$(wc -l < "${HOSTFILE}") + 
NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + export WORLD_SIZE="${NGPUS}" + echo "\ + Running on ${NHOSTS} hosts \ + with ${NGPU_PER_HOST} GPUs each \ + for a total of ${NGPUS} GPUs" + EXEC_STR=( + "${MPI_COMMAND}" + "${MPI_DEFAULTS}" + "${MPI_ELASTIC}" + "$(which python3)" + "${MAIN}" + "${gpt_args}" + "${ds_args}" + ) + EXEC="${EXEC_STR[*]}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + echo "Writing logs to: ${OUTPUT_LOG}" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + # launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + # launchJob "$@" + # printJobInfo | tee -a "${OUTPUT_LOG}" + # launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + PID=$! + wait $PID +} diff --git a/ALCF/model.sh b/ALCF/model.sh new file mode 100755 index 0000000000..2427893e9b --- /dev/null +++ b/ALCF/model.sh @@ -0,0 +1,264 @@ +#!/bin/bash --login +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ GPT MODEL SETTINGS ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Model / Architecture settings ┃ +# ┃ ---------------------------------------------------- ┃ +# ┃ GPT-3 models use 2K sequence length/context window ┃ +# ┃ The "GPT-3 XXX" below are configs from GPT-3 paper ┃ +# ┃ https://arxiv.org/abs/2005.14165, choose based on ┃ +# ┃ your desired model size or build your own configs ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +declare -A A_NLAYERS +declare -A A_HIDDEN +declare -A A_ATEN_HEADS + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3 Small: 125M ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +MODEL_125M_KEY="GPT125M" +A_NLAYERS[$MODEL_125M_KEY]=12 +A_HIDDEN[$MODEL_125M_KEY]=768 +A_ATEN_HEADS[$MODEL_125M_KEY]=16 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ BERT: 1.2B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="BERT1.2B" +# 
NLAYERS=24 +# HIDDEN=2048 +# ATEN_HEADS=128 + +BERT_1_2B_KEY="BERT1.2B" +A_NLAYERS[$BERT_1_2B_KEY]=24 +A_HIDDEN[$BERT_1_2B_KEY]=2048 +A_ATEN_HEADS[$BERT_1_2B_KEY]=128 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1536 +# ATEN_HEADS=24 + +MODEL_1_5B_KEY="GPT1_5B" +A_NLAYERS[$MODEL_1_5B_KEY]=48 +A_HIDDEN[$MODEL_1_5B_KEY]=1536 +A_ATEN_HEADS[$MODEL_1_5B_KEY]=24 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 1.5B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="1.5B" +# NLAYERS=48 +# HIDDEN=1600 +# ATEN_HEADS=25 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 2.7B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="2.7B" +# NLAYERS=32 +# HIDDEN=2560 +# ATEN_HEADS=32 + +MODEL_2_7B_KEY="GPT2_7B" +A_NLAYERS[$MODEL_2_7B_KEY]=32 +A_HIDDEN[$MODEL_2_7B_KEY]=2560 +A_ATEN_HEADS[$MODEL_2_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 6.7B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="6.7B" +# NLAYERS=32 +# HIDDEN=4096 +# ATEN_HEADS=32 + +MODEL_6_7B_KEY="GPT6_7B" +A_NLAYERS[$MODEL_6_7B_KEY]=32 +A_HIDDEN[$MODEL_6_7B_KEY]=4096 +A_ATEN_HEADS[$MODEL_6_7B_KEY]=32 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 13B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="13B" +# NLAYERS=40 +# HIDDEN=5120 +# ATEN_HEADS=40 + +MODEL_13B_KEY="GPT13B" +A_NLAYERS[$MODEL_13B_KEY]=40 +A_HIDDEN[$MODEL_13B_KEY]=5120 +A_ATEN_HEADS[$MODEL_13B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 18.4B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="18.4B" +# NLAYERS=40 +# HIDDEN=6144 +# ATEN_HEADS=48 + +# ┏━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ ✓ GPT-3: 20B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="20B" +# NLAYERS=44 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 25B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="25B" +# NLAYERS=64 +# ------------ +# HIDDEN=5760 # DEFAULT (no flash attn) +# ATEN_HEADS=64 +# ------------ +# HIDDEN=5888 # headdim = 5888 / 46 = 128 +# ATEN_HEADS=46 +# ----------------- +# -- 
FLASH ATTN -- +# headdim = 5760 / 80 = 72 +# HIDDEN=5760 +# ATEN_HEADS=80 +# ------------ + +MODEL_25B_KEY="GPT25B" +A_NLAYERS[$MODEL_25B_KEY]=64 +A_HIDDEN[$MODEL_25B_KEY]=6144 +A_ATEN_HEADS[$MODEL_25B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 30B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="30B" +# NLAYERS=64 +# HIDDEN=6144 +# ATEN_HEADS=64 + +# head size must be divisible by 8 (requirements of flash attention) +# head num must be divisible by sequence/tensor parallel size +MODEL_30B_KEY="GPT30B" +A_NLAYERS[$MODEL_30B_KEY]=64 +A_HIDDEN[$MODEL_30B_KEY]=6144 +A_ATEN_HEADS[$MODEL_30B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 33B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="33B" +# NLAYERS=80 +# HIDDEN=5760 +# ATEN_HEADS=80 + +# MODEL_33B_KEY="GPT33B" +# A_NLAYERS[$MODEL_33B_KEY]=80 +# A_HIDDEN[$MODEL_33B_KEY]=5760 +# A_ATEN_HEADS[$MODEL_33B_KEY]=80 + +MODEL_33B_KEY="GPT33B" +A_NLAYERS[$MODEL_33B_KEY]=80 +A_HIDDEN[$MODEL_33B_KEY]=6144 +A_ATEN_HEADS[$MODEL_33B_KEY]=64 + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 145B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="145B" +# NLAYERS=80 +# HIDDEN=12288 +# ATEN_HEADS=96 +# +GPT145B_HIDDEN=12288 +GPT145B_ATEN_HEADS=96 + +MODEL_145B_2L_KEY="GPT145B_2L" +A_NLAYERS[$MODEL_145B_2L_KEY]=2 +A_HIDDEN[$MODEL_145B_2L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_2L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_4L_KEY="GPT145B_4L" +A_NLAYERS[$MODEL_145B_4L_KEY]=4 +A_HIDDEN[$MODEL_145B_4L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_4L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_5L_KEY="GPT145B_5L" +A_NLAYERS[$MODEL_145B_5L_KEY]=5 +A_HIDDEN[$MODEL_145B_5L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_5L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_6L_KEY="GPT145B_6L" +A_NLAYERS[$MODEL_145B_6L_KEY]=6 +A_HIDDEN[$MODEL_145B_6L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_6L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_8L_KEY="GPT145B_8L" +A_NLAYERS[$MODEL_145B_8L_KEY]=8 
+A_HIDDEN[$MODEL_145B_8L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_8L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_10L_KEY="GPT145B_10L" +A_NLAYERS[$MODEL_145B_10L_KEY]=10 +A_HIDDEN[$MODEL_145B_10L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_10L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_12L_KEY="GPT145B_12L" +A_NLAYERS[$MODEL_145B_12L_KEY]=12 +A_HIDDEN[$MODEL_145B_12L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_12L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_16L_KEY="GPT145B_16L" +A_NLAYERS[$MODEL_145B_16L_KEY]=16 +A_HIDDEN[$MODEL_145B_16L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_16L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_24L_KEY="GPT145B_24L" +A_NLAYERS[$MODEL_145B_24L_KEY]=24 +A_HIDDEN[$MODEL_145B_24L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_24L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_32L_KEY="GPT145B_32L" +A_NLAYERS[$MODEL_145B_32L_KEY]=32 +A_HIDDEN[$MODEL_145B_32L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_32L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_48L_KEY="GPT145B_48L" +A_NLAYERS[$MODEL_145B_48L_KEY]=48 +A_HIDDEN[$MODEL_145B_48L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_48L_KEY]="${GPT145B_ATEN_HEADS}" + +MODEL_145B_64L_KEY="GPT145B_64L" +A_NLAYERS[$MODEL_145B_64L_KEY]=64 +A_HIDDEN[$MODEL_145B_64L_KEY]="${GPT145B_HIDDEN}" +A_ATEN_HEADS[$MODEL_145B_64L_KEY]="${GPT145B_ATEN_HEADS}" + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ GPT-3: 175B Params ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE="175B" +# NLAYERS=96 +# HIDDEN=12288 +# ATEN_HEADS=96 +# if [ -z "$NLAYERS" ]; then +# A_NLAYERS[$MODEL_145B_KEY]="${NLAYERS}" +# echo "Caught NLAYERS=${NLAYERS} from env, using this value!" 
+# else +# A_NLAYERS[$MODEL_145B_KEY]=80 +# echo "Using default NLAYERS=80" +# fi +MODEL_145B_KEY="GPT145B" +A_NLAYERS[$MODEL_145B_KEY]=80 +A_HIDDEN[$MODEL_145B_KEY]=12288 +A_ATEN_HEADS[$MODEL_145B_KEY]=96 + +export MODEL_SIZE="${MODEL_SIZE_KEY}" +export NLAYERS="${A_NLAYERS[$MODEL_SIZE_KEY]}" +export HIDDEN="${A_HIDDEN[$MODEL_SIZE_KEY]}" +export ATEN_HEADS="${A_ATEN_HEADS[$MODEL_SIZE_KEY]}" diff --git a/ALCF/mpi_wrapper b/ALCF/mpi_wrapper new file mode 100755 index 0000000000..770f950e1d --- /dev/null +++ b/ALCF/mpi_wrapper @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +PYTHON=${PYTHON:-python3} + +MASTER_ADDR="$1" +MASTER_PORT="$2" +TRAINING_SCRIPT="$3" + +shift 3 + +test -n "${MASTER_ADDR}" -a -n "${MASTER_PORT}" -a -n "${OMPI_COMM_WORLD_RANK}" -a -n "${OMPI_COMM_WORLD_SIZE}" -a -n "${OMPI_COMM_WORLD_LOCAL_RANK}" +test -f "${TRAINING_SCRIPT}" + +set -x + +LOCAL_RANK=$((OMPI_COMM_WORLD_RANK % 8)) + +exec env \ + MASTER_ADDR="${MASTER_ADDR}" \ + MASTER_PORT="${MASTER_PORT}" \ + RANK="${OMPI_COMM_WORLD_RANK}" \ + WORLD_SIZE="${OMPI_COMM_WORLD_SIZE}" \ + ${PYTHON} -u "${TRAINING_SCRIPT}" "--local_rank=${LOCAL_RANK}" "$@" + +exit 1 diff --git a/ALCF/setup.sh b/ALCF/setup.sh new file mode 100755 index 0000000000..0b90daf02e --- /dev/null +++ b/ALCF/setup.sh @@ -0,0 +1,228 @@ +#!/bin/bash --login +# +# DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) +SOURCE=${BASH_SOURCE[0]} +while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) + SOURCE=$(readlink "$SOURCE") + [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +PARENT=$(dirname "${DIR}") + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +thetagpuMPI() { + NHOSTS=$(wc -l < "${COBALT_NODEFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + 
NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + NVME_PATH="/raid/scratch/" + MPI_COMMAND=$(which mpirun) + # export PATH="${CONDA_PREFIX}/bin:${PATH}" + MPI_DEFAULTS="\ + --hostfile ${HOSTFILE} \ + -x CFLAGS \ + -x LDFLAGS \ + -x http_proxy \ + -x PYTHONUSERBASE \ + -x https_proxy \ + -x PATH \ + -x CUDA_DEVICE_MAX_CONNECTIONS \ + -x LD_LIBRARY_PATH" + MPI_ELASTIC="\ + -n ${NGPUS} \ + -npernode ${NGPU_PER_HOST}" +} + +polarisMPI() { + NHOSTS=$(wc -l < "${PBS_NODEFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + MPI_COMMAND=$(which mpiexec) + NVME_PATH="/local/scratch/" + MPI_DEFAULTS="\ + --envall \ + --verbose \ + --hostfile ${HOSTFILE}" + MPI_ELASTIC="\ + -n ${NGPUS} \ + --ppn ${NGPU_PER_HOST}" +} + +setupMPI() { + if [[ $(hostname) == theta* ]]; then + echo "Setting up MPI on ThetaGPU from $(hostname)" + thetagpuMPI + elif [[ $(hostname) == x* ]]; then + echo "Setting up MPI on Polaris from $(hostname)" + polarisMPI + else + echo "Unexpected hostname $(hostname)" + fi +} + +condaThetaGPU220701() { + module load conda/2022-07-01 ; conda activate base + conda activate \ + /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2022-07-01 + # if [[ -f "${PARENT}/.venvs/thetaGPU/2022-07-01-deepspeed/bin/activate" ]]; then + # echo "Found virtual environment!" 
+ # source "${PARENT}/.venvs/thetaGPU/2022-07-01-deepspeed/bin/activate" + # fi +} + +condaThetaGPU230111() { + module load conda/2023-01-11 ; conda activate base +# conda activate \ +# /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-01-11-deepspeed + VENV_DIR="${PARENT}/venvs/thetaGPU/2023-01-11-deepspeed" + if [[ -d "${VENV_DIR}" ]] ; then + echo "Found venv at: ${VENV_DIR}" + # shellcheck source='../venvs/thetaGPU/2023-01-10/bin/activate' + source "${VENV_DIR}/bin/activate" + fi +} + +condaThetaGPU() { + module load conda/2022-07-01 ; conda activate base + conda activate \ + /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2022-07-01 + echo "USING PYTHON: $(which python3)" +} + +condaThetaGPU_mtanaka() { + # module load conda/2023-01-11 ; conda activate base + # conda activate \ + # /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-01-11-deepspeed + VENV_DIR="/lus/grand/projects/datascience/mtanaka/dsseq/venv/dsseq" + if [[ -d "${VENV_DIR}" ]] ; then + echo "Found venv at: ${VENV_DIR}" + # shellcheck source='../venvs/thetaGPU/2023-01-10/bin/activate' + source "${VENV_DIR}/bin/activate" + fi +} + +condaPolaris220908() { + echo "Loading: 'module load conda 2022-09-08 ; conda activate base'" + module load conda/2022-09-08 ; conda activate base + conda activate /lus/grand/projects/datascience/foremans/locations/polaris/miniconda3/envs/2022-09-08-deepspeed + export CFLAGS="-I${CONDA_PREFIX}/include" + export LDFLAGS="-L${CONDA_PREFIX}/lib" + VENV_DIR="${PARENT}/venvs/polaris/2022-09-08" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + source "${VENV_DIR}/bin/activate" + fi +} + +condaPolaris230110() { + echo "Loading: 'module load conda 2023-01-10-unstable ; conda activate base'" + module load conda/2023-01-10-unstable ; conda activate base + export CFLAGS="-I${CONDA_PREFIX}/include" + export LDFLAGS="-L${CONDA_PREFIX}/lib" + # conda activate \ + # 
/lus/grand/projects/datascience/foremans/locations/polaris/miniconda3/envs/2023-01-10 + VENV_DIR="${PARENT}/venvs/polaris/2023-01-10/" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + # shellcheck source=../venvs/polaris/2023-01-10/bin/activate + source "${VENV_DIR}/bin/activate" + fi +} + +condaThetaGPU230426() { + echo "Loading: 'module load conda 2023-01-10-unstable ; conda activate base'" + module load conda/2023-01-11 + conda activate base + conda activate /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-04-26 + VENV_DIR="${PARENT}/venvs/thetaGPU/2023-04-26/" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + # shellcheck source=../venvs/thetaGPU/2023-04-26/ + source "${VENV_DIR}/bin/activate" + fi + thetagpuMPI + export CFLAGS="-I${CONDA_PREFIX}/include" + export LDFLAGS="-L${CONDA_PREFIX}/lib" +} + +condaPolaris() { + condaPolaris230110 + echo "USING PYTHON: $(which python3)" +} + +# ┏━━━━━━━━━━┓ +# ┃ ThetaGPU ┃ +# ┗━━━━━━━━━━┛ +setupThetaGPU() { + if [[ $(hostname) == theta* ]]; then + export MACHINE="ThetaGPU" + HOSTFILE="${COBALT_NODEFILE}" + # -- Python / Conda setup ------------------------------------------------- + thetagpuMPI + condaThetaGPU230426 + else + echo "Unexpected hostname: $(hostname)" + fi +} + +# ┏━━━━━━━━━┓ +# ┃ Polaris ┃ +# ┗━━━━━━━━━┛ +setupPolaris() { + if [[ $(hostname) == x* ]]; then + export MACHINE="Polaris" + HOSTFILE="${PBS_NODEFILE}" + # -- MPI / Comms Setup ---------------------------------------------------- + condaPolaris + polarisMPI + # export IBV_FORK_SAFE=1 + else + echo "Unexpected hostname: $(hostname)" + fi +} + +# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +# ┃ SETUP CONDA + MPI ENVIRONMENT @ ALCF ┃ +# ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +setup() { + # unset PYTHONUSERBASE + export NCCL_DEBUG=warn + export WANDB_CACHE_DIR="./cache/wandb" + CFLAGS="-I${CONDA_PREFIX}/include/" + LDFLAGS="-L${CONDA_PREFIX}/lib/" + # export CFLAGS="${CFLAGS}" 
+ # export LDFLAGS="${LDFLAGS}" + # export PATH="${CONDA_PREFIX}/bin:${PATH}" + + export NVME_PATH="${NVME_PATH}" + export MPI_DEFAULTS="${MPI_DEFAULTS}" + export MPI_ELASTIC="${MPI_ELASTIC}" + export MPI_COMMAND="${MPI_COMMAND}" + + PYTHON_EXECUTABLE="$(which python3)" + export PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" + echo "USING PYTHON: $(which python3)" + echo "CFLAGS: ${CFLAGS}" + echo "LDFLAGS: ${LDFLAGS}" + # source "${DIR}/args.sh" + + if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + setupThetaGPU + elif [[ $(hostname) == x* ]]; then + echo "Setting up Polaris from $(hostname)" + setupPolaris + else + echo "Unexpected hostname $(hostname)" + fi + export NODE_RANK=0 + # export RANK=0 + export NNODES=$NHOSTS + export GPUS_PER_NODE=$NGPU_PER_HOST + export WORLD_SIZE=$NGPUS + export NGPUS="${NGPUS}" + export NHOSTS="${NHOSTS}" + export NGPU_PER_HOST="${NGPU_PER_HOST}" + export CUDA_DEVICE_MAX_CONNECTIONS=1 +} diff --git a/ALCF/submit-pbs.sh b/ALCF/submit-pbs.sh new file mode 100755 index 0000000000..712a932656 --- /dev/null +++ b/ALCF/submit-pbs.sh @@ -0,0 +1,263 @@ +#!/bin/bash --login +# + +cd "${PBS_O_WORKDIR}" || exit + +# cd "${PBS_O_WORKDIR}" +# +# echo "PBS_O_WORKDIR: ${PBS_O_WORKDIR}" +# +# echo "__________________________________________________________________________________" +# cd ~/datascience/foremans/locations/polaris/projects/saforem2/Megatron-DS-Benchmarking/ +# echo "pwd: $(pwd)" +# echo "__________________________________________________________________________________" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && 
pwd )" + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + + +TSTAMP=$(tstamp) +echo "┌──────────────────────────────────────────────────────────────────┐" +#####"│ Job Started at 2023-08-04-121535 on polaris-login-04 by foremans │" +echo "│ Job Started at ${TSTAMP} on $(hostname) by $USER │" +echo "│ in: ${PARENT}" +echo "└──────────────────────────────────────────────────────────────────┘" +# echo "------------------------------------------------------------------------" + +getValFromFile() { + FILE=$1 + KEY=$2 + echo "getting ${KEY} from ${FILE}" + if [[ -f "${FILE}" ]]; then + VAL="$(cat "${FILE}" | grep -E "^${KEY}=" | sed "s/${KEY}=//g" | sed 's/\"//g')" + echo "setting ${KEY}: ${VAL}" + export "${KEY}"="${VAL}" + fi +} + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} +# +# sourceFile "${DIR}/setup.sh" +# sourceFile "${DIR}/model.sh" +# sourceFile "${DIR}/args.sh" +# sourceFile "${DIR}/launch.sh" +# +# export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +# export SEQ_LEN=${SEQ_LEN:-1024} +# export MPSIZE=${MPSIZE:-1} +# export PPSIZE=${PPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +# export NHOSTS="$NHOSTS" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 +# +# +# export MODEL_SIZE_KEY="GPT1_5B" +# export SEQ_LEN=1024 +# export USE_FLASH_ATTN=1 +# export MICRO_BATCH=4 +# export WORLD_SIZE=8 +# export SP_TYPE="ds" +# 
export SPSIZE=8 +# export PPSIZE=1 +# export MPSIZE=1 +# export ZERO_STAGE=3 +# export USE_SEQUENCE_PARALLEL=0 + + +# getValFromFile "${DIR}/model.sh" MODEL_SIZE +# getValFromFile "${DIR}/args.sh" PPSIZE +# getValFromFile "${DIR}/args.sh" MPSIZE +# getValFromFile "${DIR}/args.sh" MICRO_BATCH +# getValFromFile "${DIR}/args.sh" GRADIENT_ACCUMULATION_STEPS +# +# MODEL_SIZE="${MODEL_SIZE}" +# PPSIZE="${PPSIZE}" +# MPSIZE="${MPSIZE}" +# MICRO_BATCH="${MICRO_BATCH}" +# GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS}" + +QUEUE=$1 +NUM_NODES=$2 +DURATION=$3 +PROJECT=$4 + +# MODEL_SIZE_KEY=$5 +# SEQ_LEN=$6 +# USE_FLASH_ATTN=$7 +# MICRO_BATCH=$8 +# GAS=$9 +# SP_TYPE=$10 + +# MODEL_SIZE_KEY="GPT6_7B" SEQ_LEN=2048 USE_FLASH_ATTN=0 MICRO_BATCH=1 GAS=1 SP_TYPE="deepspeed" ./ALCF/submit-pbs.sh debug-scaling 4 00:30:00 datascience + +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export MICRO_BATCH="${MICRO_BATCH}" +# export MODEL_SIZE="${MODEL_SIZE}" +# # export GAS="${GRADIENT_ACCUMULATION_STEPS}" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# +# export DDP_IMPL="local" # FSDP | local | torch +# # export USE_FLASH_ATTN=${USE_FLASH_ATTN:-0} # 1 | 0 +# # export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 +# export SEQ_LEN=${SEQ_LEN:-1024} +# # export MPSIZE=${MPSIZE:-1} +# export PPSIZE=${PPSIZE:-1} +# export SPSIZE=${SPSIZE:-1} +# export MICRO_BATCH=${MICRO_BATCH:-1} +# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 +# # export NHOSTS="$NHOSTS" +# export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} +# export USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 +# + +if [ -z "${MODEL_SIZE_KEY}" ]; then + echo "ERROR: MODEL_SIZE_KEY not set" + exit 1 +fi + +if [ -z "${SEQ_LEN}" ]; then + echo "ERROR: SEQ_LEN not set" + echo "Using default SEQ_LEN=2048" + echo "Set SEQ_LEN=XXXX to change" + SEQ_LEN=2048 +fi + +if [ -z "${USE_FLASH_ATTN}" ]; then + echo "ERROR: USE_FLASH_ATTN not set" + echo "Not using flash attn! 
Set USE_FLASH_ATTN=1 to use" + USE_FLASH_ATTN=0 +fi + +if [ -z "${MICRO_BATCH}" ]; then + echo "ERROR: MICRO_BATCH not set" + echo "Using MICRO_BATCH=1" + MICRO_BATCH=1 +fi + +if [ -z "${GAS}" ]; then + echo "ERROR: GAS not set" + echo "Using GAS=1" + GAS=1 +fi + +if [ -z "${SP_TYPE}" ]; then + echo "ERROR: SP_TYPE not set" + echo "Using SP_TYPE=megatron" + SP_TYPE="megatron" +fi + +export GAS="${GAS}" +export SEQ_LEN="${SEQ_LEN}" +export SP_TYPE="${SP_TYPE}" +export MICRO_BATCH="${MICRO_BATCH}" +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY}" +export USE_FLASH_ATTN="${USE_FLASH_ATTN}" + +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" +echo "| MODEL_SIZE_KEY: ${MODEL_SIZE_KEY}" +echo "| SEQ_LEN: ${SEQ_LEN}" +echo "| USE_FLASH_ATTN: ${USE_FLASH_ATTN}" +echo "| MICRO_BATCH: ${MICRO_BATCH}" +echo "| GAS: ${GAS}" +echo "| SP_TYPE: ${SP_TYPE}" +echo "-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-" + +export QUEUE="${QUEUE}" +export DURATION="${DURATION}" +export TSTAMP="${TSTAMP}" +export NUM_NODES="${NUM_NODES}" +export PROJECT="${PROJECT}" + +RUN_NAME="N${NUM_NODES}-${TSTAMP}" +# RUN_NAME="mb${MICRO_BATCH}-gas${GAS}-${RUN_NAME}" +# RUN_NAME="GPT3-${MODEL_SIZE}-${RUN_NAME}" +RUN_NAME="${MODEL_SIZE_KEY}-${SP_TYPE}-mb${MICRO_BATCH}-gas${GAS}-seqlen${SEQ_LEN}-${RUN_NAME}" +export RUN_NAME="${RUN_NAME}" + +echo "QUEUE=$QUEUE" +echo "PROJECT=$PROJECT" +echo "DURATION=$DURATION" +echo "TSTAMP=$TSTAMP" +echo "NUM_NODES=$NUM_NODES" +echo "RUN_NAME: ${RUN_NAME}" +# echo "MODEL_SIZE=$MODEL_SIZE" +# echo "GAS=$GRADIENT_ACCUMULATION_STEPS" + +# QSUB_ARGS=( +# "-q ${QUEUE}" +# "-A ${PROJECT}" +# "-N ${RUN_NAME}" +# "-l select=${NUM_NODES}" +# "-l walltime=${DURATION}" +# "-l filesystems=eagle:home:grand" +# "${DIR}/submit.sh" +# ) + +OUTPUT=$(qsub \ + -q "${QUEUE}" \ + -A "${PROJECT}" \ + -N "${RUN_NAME}" \ + -l select="${NUM_NODES}" \ + -l walltime="${DURATION}" \ + -l filesystems=eagle:home:grand \ + "${ALCF_DIR}/submit.sh") + +# OUTPUT=$(qsub "${QSUB_ARGS[@]}") + 
+PBS_JOBID=$(echo "${OUTPUT}" | cut --delimiter="." --fields=1) +export PBS_JOBID="${PBS_JOBID}" +# echo "${TSTAMP} ${PBS_JOBID} " + +PBS_JOBSTR=( + "PBS_JOBID=${PBS_JOBID}" + "QUEUE=$QUEUE" + "PROJECT=$PROJECT" + "DURATION=$DURATION" + "TSTAMP=$TSTAMP" + "NUM_NODES=$NUM_NODES" + # "MODEL_SIZE=$MODEL_SIZE" + "RUN_NAME: ${RUN_NAME}" +) + # "GAS=$GRADIENT_ACCUMULATION_STEPS" + +TODAY=$(echo "${TSTAMP}" | cut --delimiter="-" --fields=1,2,3) +OUTFILE="${PARENT}/pbslogs/${TODAY}/${PBS_JOBID}.txt" + +if [[ ! -d $(dirname "${OUTFILE}") ]]; then + mkdir -p "$(dirname "${OUTFILE}")" +fi + +echo "Writing PBS_JOBSTR to ${OUTFILE}" +echo "${PBS_JOBSTR[@]}" >> "${OUTFILE}" +# echo "${PBS_JOBSTR[@]}" | tee -a "${OUTFILE}" + +echo "┌───────────────────────────────────────────┐" +echo "│ To view job output, run: \`pbstail ${PBS_JOBID}\` │" +echo "└───────────────────────────────────────────┘" diff --git a/ALCF/submit.sh b/ALCF/submit.sh new file mode 100755 index 0000000000..cc308b5be5 --- /dev/null +++ b/ALCF/submit.sh @@ -0,0 +1,66 @@ +#!/bin/bash --login +#PBS -V +# +cd "${PBS_O_WORKDIR}" || exit + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") +export TSTAMP="$TSTAMP" + +ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ 
+#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" + exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +# SCRIPT_DIR="/lus/grand/projects/datascience/foremans/locations/polaris/projects/saforem2/Megatron-DS-Benchmarking/ALCF/" +MODEL_FILE="${ALCF_DIR}/model.sh" +ARGS_FILE="${ALCF_DIR}/args.sh" +LAUNCH_FILE="${ALCF_DIR}/launch.sh" +SETUP_FILE="${ALCF_DIR}/setup.sh" + +sourceFile "${SETUP_FILE}" +sourceFile "${ARGS_FILE}" +sourceFile "${MODEL_FILE}" +sourceFile "${LAUNCH_FILE}" +# if [[ -f "${LAUNCH_FILE}" ]]; then +# echo "source-ing ${LAUNCH_FILE}" +# # shellcheck source=./launch.sh +# source "${LAUNCH_FILE}" +# else +# echo "ERROR: UNABLE TO SOURCE ${LAUNCH_FILE}" +# fi + +setup +elasticDistributed "$@" +wait $! 
diff --git a/ALCF/sweep_args_theta_multinode.sh b/ALCF/sweep_args_theta_multinode.sh new file mode 100755 index 0000000000..ac6409233a --- /dev/null +++ b/ALCF/sweep_args_theta_multinode.sh @@ -0,0 +1,164 @@ +#!/bin/bash -l + +module load conda/2023-01-11 +conda activate base +# cd /home/czh5/seq/Megatron-DS-Benchmarking/ALCF +# source /home/czh5/seq/Megatron-DS-Benchmarking/venvs/thetaGPU/2023-01-11-deepspeed/bin/activate + +# rm -rf /home/czh5/genome/Megatron-DeepSpeed/dataset/*.npy +# rm -rf /home/czh5/genome/Megatron-DeepSpeed/dataset/*.done +# +SCRIPT_PATH="${BASH_SOURCE[0]}" +while [ -L "$SCRIPT_PATH" ]; do + SCRIPT_DIR="$(cd -P "$(dirname "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" + SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" + [[ ${SCRIPT_PATH} != /* ]] && SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_PATH}" +done +SCRIPT_PATH="$(readlink -f "$SCRIPT_PATH")" +SCRIPT_DIR="$(cd -P "$(dirname -- "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" + +SOURCE=${BASH_SOURCE[0]} +while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) + SOURCE=$(readlink "$SOURCE") + [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +SETUP_FILE="${DIR}/setup.sh" +MODEL_FILE="${DIR}/model.sh" +ARGS_FILE="${DIR}/args.sh" +LAUNCH_FILE="${DIR}/launch.sh" + + +sourceFile "${SETUP_FILE}" +sourceFile "${MODEL_FILE}" +sourceFile "${ARGS_FILE}" +sourceFile "${LAUNCH_FILE}" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +echo "My current script is: ${SCRIPT_DIR[0]}" + + +if [[ $(hostname) == theta* ]]; then + 
HOSTFILE="${COBALT_NODEFILE}" +elif [[ $(hostname) == x* ]]; then + HOSTFILE="${PBS_NODEFILE}" +else + echo "Unexpected hostname $(hostname)" +fi + +echo "Found hostfile: ${HOSTFILE}" + +NHOSTS=$(wc -l < "${HOSTFILE}") +NGPU_PER_HOST=$(nvidia-smi -L | wc -l) +PARALLEL_SIZE=$(( NHOSTS * NGPU_PER_HOST )) + +export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt +export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron + +K_VALS=( + # 2 + # 4 + 8 + # 16 + # 32 + # 64 + # 128 + # 192 + # 256 + # 272 + # 320 + # 384 + # 448 + # 512 + # 1024 +) + +SEQLEN_VALS=( +# 2048 +# 4096 +# 8192 +# 16384 +# 32768 +# 65536 +# 131072 +# 262144 +# 524288 +# 1048576 +# 2097152 +) + +MODEL_SIZE_VALS=( + # "GPT125M" + # "BERT1.2B" +# "GPT1_5B" + # "GPT2_7B" +# "GPT6_7B" + # "GPT13B" + "GPT25B" +# "GPT30B" +# "GPT33B" +) + +for MODEL_SIZE_KEY in "${MODEL_SIZE_VALS[@]}"; do + export MODEL_SIZE_KEY +# for SEQ_LEN in "${SEQLEN_VALS[@]}"; do +# export SEQ_LEN + for NUM_K in "${K_VALS[@]}"; do + # common_factor=$(( $PARALLEL_SIZE * 8 )) + # export SEQ_LEN=$(( 1024 * $NUM_K / $common_factor * $common_factor )) + + export SEQ_LEN=$(( 1024 * $NUM_K )) + + if [[ ${SP_TYPE} == "ds" ]]; then + echo "DS sequence parallel" + export SPSIZE=${PARALLEL_SIZE} + export MPSIZE=1 + export ZERO_STAGE=3 + export USE_SEQUENCE_PARALLEL=0 + bash ./benchmark_train.sh + fi + + if [[ ${SP_TYPE} == "megatron" ]]; then + echo "Megatron's sequence parallel" + + # if [ ${SEQ_LEN} -eq 8192 ]; then + # PARALLEL_SIZE=8 + # fi + + # if [ ${SEQ_LEN} -eq 16384 ]; then + # PARALLEL_SIZE=8 + # fi + + # if [ ${SEQ_LEN} -eq 32768 ]; then + # PARALLEL_SIZE=16 + # fi + + # if [ ${SEQ_LEN} -eq 65536 ]; then + # PARALLEL_SIZE=16 + # fi + + export SPSIZE=1 + export MPSIZE=${PARALLEL_SIZE} + export ZERO_STAGE=0 + export USE_SEQUENCE_PARALLEL=1 + bash ./benchmark_train.sh + fi + + printf "\n------------------------" + echo SEQ_LEN=${SEQ_LEN} + done +done diff --git a/ALCF/train-gpt3.sh b/ALCF/train-gpt3.sh new file mode 100755 index 
0000000000..fa707b5df4 --- /dev/null +++ b/ALCF/train-gpt3.sh @@ -0,0 +1,71 @@ +#!/bin/bash --login + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +ALCF_DIR=$(find "${HERE}" -name "ALCF") + + +# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" 
+ exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +LAUNCH_FILE="${ALCF_DIR}/launch.sh" + +sourceFile "${ALCF_DIR}/setup.sh" +sourceFile "${ALCF_DIR}/model.sh" +sourceFile "${ALCF_DIR}/args.sh" +sourceFile "${LAUNCH_FILE}" + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +# PID=$! +# wait $PID +elasticDistributed "$@" 2>&1 & From 2a03c730baa8375a40b037699969ffc16a8bd69d Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 3 Oct 2023 21:46:28 -0500 Subject: [PATCH 002/268] Update `ALCF/*` --- ALCF/args.sh | 767 ++++++++++++----------------- ALCF/benchmark_train.sh | 82 --- ALCF/launch.sh | 234 +++++---- ALCF/model.sh | 22 + ALCF/mpi_wrapper | 27 - ALCF/setup.sh | 408 +++++++-------- ALCF/sweep_args_theta_multinode.sh | 164 ------ ALCF/train-gpt3.sh | 5 +- 8 files changed, 665 insertions(+), 1044 deletions(-) delete mode 100755 ALCF/benchmark_train.sh delete mode 100755 ALCF/mpi_wrapper delete mode 100755 ALCF/sweep_args_theta_multinode.sh diff --git a/ALCF/args.sh b/ALCF/args.sh index f452d3b7aa..200fd21fc9 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -1,90 +1,45 @@ -#!/bin/bash -login - -# SCRIPT_PATH="${BASH_SOURCE[0]}" -# while [ -L "$SCRIPT_PATH" ]; do -# SCRIPT_DIR="$(cd -P "$(dirname "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" -# SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" -# [[ ${SCRIPT_PATH} != /* ]] && SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_PATH}" -# done -# SCRIPT_PATH="$(readlink -f "$SCRIPT_PATH")" -# SCRIPT_DIR="$(cd -P 
"$(dirname -- "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" -# -# SOURCE=${BASH_SOURCE[0]} -# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink -# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) -# SOURCE=$(readlink "$SOURCE") -# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located -# done -# DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" +#!/bin/bash --login function FindMegatron() { - MEGATRON_INSTALL=$(python3 -c 'import megatron; print(megatron.__file__)' | tail -1) - MEGATRON_DIR=$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1))) + MEGATRON_INSTALL=$(python3 -c 'import megatron; print(megatron.__file__)' | tail -1) + MEGATRON_DIR=$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1))) } function WhereAmI() { - python3 -c 'import os; print(os.getcwd())' + python3 -c 'import os; print(os.getcwd())' +} + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi } -HERE=$(WhereAmI) -ALCF_DIR=$(find "${HERE}" -name "ALCF") -# [ "${MEGATRON_DIR}" ] && echo "Caught ${MEGATRON_DIR} from env" || FindMegatron -# ALCF_DIR="${MEGATRON_DIR}/ALCF" -# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" PARENT=$(dirname "${ALCF_DIR}") echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" echo "ALCF_DIR: ${ALCF_DIR}" echo "PARENT: ${PARENT}" echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" -function sourceFile() { - FILE="$1" - echo "source-ing ${FILE}" - if [[ -f "${FILE}" ]]; then - # 
shellcheck source="${FILE}" - source "${FILE}" - else - echo "ERROR: UNABLE TO SOURCE ${FILE}" - fi -} - -USER=$(whoami) -# # DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) -# PARENT=$(dirname "$DIR") -# -# HERE=$(python3 -c 'import os; print(os.getcwd())') - -# echo "------------------------" -# echo "SCRIPT_DIR=$SCRIPT_DIR" -# echo "SCRIPT_PATH=$SCRIPT_PATH" -# echo "------------------------" -# echo "SOURCE=$SOURCE" -# echo "DIR=$DIR" -# echo "PARENT: ${PARENT}" -# echo "HERE: ${HERE}" -# echo "------------------------" - -if [[ $(hostname) == theta* ]]; then - echo "Setting up ThetaGPU from $(hostname)" - HOSTFILE="${COBALT_NODEFILE}" -elif [[ $(hostname) == x* ]]; then - echo "Setting up Polaris from $(hostname)" - HOSTFILE="${PBS_NODEFILE}" -else - echo "Unexpected hostname $(hostname)" -fi +HOSTNAME=$(hostname) +sourceFile "${ALCF_DIR}/setup.sh" -NHOSTS=$(wc -l < "${HOSTFILE}") -NGPU_PER_HOST=$(nvidia-smi -L | wc -l) -NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" WORLD_SIZE="${NGPUS}" PARALLEL_SIZE="${WORLD_SIZE}" -# # NGPUS="$((${NHOSTS}*${NGPU_PER_HOST}))" echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" - export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-GPT13B}" echo "==========================+" echo "Using ${MODEL_SIZE_KEY}" @@ -97,30 +52,12 @@ MODEL_TYPE=${MODEL_TYPE:-gpt} # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ Model Parallel / Pipeline Parallel ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -# ---------- -# Originals -# MPSIZE=8 -# PPSIZE=16 -# ---------- -# NHOSTS=$(wc -l < "${PBS_NODEFILE}") export DDP_IMPL="local" # FSDP | local | torch -export USE_FLASH_ATTN=${USE_FLASH_ATTN:-0} # 1 | 0 export USE_ACTIVATION_CHECKPOINTING=1 # 1 | 0 export SEQ_LEN=${SEQ_LEN:-2048} export PPSIZE=${PPSIZE:-1} -# export MPSIZE=${MPSIZE:-1} -# export SPSIZE=${SPSIZE:-1} export MICRO_BATCH=${MICRO_BATCH:-1} -# export ZERO_STAGE=${ZERO_STAGE:-1} # 0 | 1 | 2 | 3 export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} -# export NHOSTS="$NHOSTS" -# export 
USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL:-0} # 1 | 0 - -# NHOSTS=$(wc -l < "${COBALT_NODEFILE}") -# # NGPU_PER_HOST=8 -# NGPU_PER_HOST=$(nvidia-smi -L | wc -l) -# PARALLEL_SIZE=$((${NHOSTS}*${NGPU_PER_HOST})) - export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron @@ -128,115 +65,76 @@ export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron # Deal with Sequence Parallel implementation --------------------------------------- # ---------------------------------------------------------------------------------- if [[ ${SP_TYPE} == "ds" ]]; then - # NOTE: -------------------------------------------------------------------------- - # SP_TYPE="ds" has NO effect, essentially running with no Seq. || - # -------------------------------------------------------------------------------- - USE_SEQUENCE_PARALLEL=0 - if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then - echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" - SPSIZE=1 - MPSIZE="${MPSIZE}" - else - echo "Didn't catch MPSIZE from env. Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" - MPSIZE=1 - SPSIZE="${WORLD_SIZE}" - fi - # if [[ "$MPSIZE" != 0 ]]; then - # SPSIZE=$(( WORLD_SIZE - MPSIZE )) - # echo "############################################################" - # echo "Caught MPSIZE: $MPSIZE from env!" 
- # echo "Setting SPSIZE: (${WORLD_SIZE} - ${MPSIZE}) = ${SPSIZE}" - # echo "############################################################" - # else - # MPSIZE=1 - # SPSIZE="$WORLD_SIZE" - # echo "############################################################" - # echo "Setting MPSIZE: $SPSIZE, SPSIZE: $WORLD_SIZE = $SPSIZE" - # echo "############################################################" - # fi - # [ "$MPSIZE" ] && SPSIZE=1 || SPSIZE="${WORLD_SIZE}" - # [ "$SPSIZE" ] && MPSIZE=1 || MPSIZE="${}" - # [ "$MPSIZE" = "$WORLD_SIZE" ] && SPSIZE=1 || SPSIZE="$WORLD_SIZE" - # [ "$SPSIZE" = "$WORLD_SIZE" ] && MPSIZE=1 || MPSIZE="${WORLD_SIZE}" - # export SPSIZE="${WORLD_SIZE}" - export SPSIZE="${SPSIZE:-$WORLD_SIZE}" - export MPSIZE="${MPSIZE:-1}" - export USE_SEQUENCE_PARALLEL=0 - if [ -z "${ZERO_STAGE}" ]; then - echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" - ZERO_STAGE=3 - else - echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" - fi - export ZERO_STAGE="${ZERO_STAGE}" + # NOTE: -------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism + # -------------------------------------------------------------------------- + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + # hacky workaround to try and use SP_TYPE="ds" + MPSIZE="${WORLD_SIZE}" + # ------------------------------------------------------------------------ + # Update [2023-08-22]: Chengming mentioned that this is an internal issue + # and will NOT work currently + # ------------------------------------------------------------------------ + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. 
Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + export ZERO_STAGE="${ZERO_STAGE}" elif [[ ${SP_TYPE} == "megatron" ]]; then - # NOTE: -------------------------------------------------------------------------- - # SP_TYPE="megatron" will use Megatron's Seq. || implementation with ZERO_STAGE=0 - # -------------------------------------------------------------------------------- - # export SPSIZE=1 - # export MPSIZE="${WORLD_SIZE}" - [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 - [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" - [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 - # [ "$USE_SEQUENCE_PARALLEL" = 0 ] && export USE_SEQUENCE_PARALLEL=0 || export USE_SEQUENCE_PARALLEL=1 - # if [[ "$SPSIZE" == 0 ]]; then - # echo "Caught SPSIZE=$SPSIZE from env!!" - # USE_SEQUENCE_PARALLEL=0 - # else - # USE_SEQUENCE_PARALLEL=1 - # fi - # if [[ "$USE_SEQUENCE_PARALLEL" ]]; then - # echo "Caught USE_SEQUENCE_PARALLEL=${USE_SEQUENCE_PARALLEL} from env!!" 
- # [ "${SPSIZE}" != 0 ] && USE_SEQUENCE_PARALLEL=1 || USE_SEQUENCE_PARALLEL=0 - # [ "$USE_SEQUENCE_PARALLEL" = 0 ] && echo "Not using sequence parallelism" || USE_SEQUENCE_PARALLEL=1 - export SPSIZE="${SPSIZE}" - export MPSIZE="${MPSIZE}" - export ZERO_STAGE="${ZERO_STAGE}" - export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" - # if [[ "${SPSIZE}" == 0 ]]; then - # [ "$SPSIZE" ] && USE_SEQUENCE_PARALLEL=1 || USE_SEQUENCE_PARALLEL=0 - # echo "Caught SPSIZE=${SPSIZE} from env, with ${SP_TYPE} sequence parallelism" - # export SPSIZE="${SPSIZE}" - # export USE_SEQUENCE_PARALLEL=0 - # else - # export SPSIZE=1 - # export USE_SEQUENCE_PARALLEL=1 - # fi - # if [ -z "${ZERO_STAGE}" ]; then - # echo "ZERO_STAGE not set, setting to 0 for ${SP_TYPE}" - # ZERO_STAGE=0 - # else - # echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" - # fi + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. || implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" else - echo "Unexpected SP_TYPE: ${SP_TYPE}" - exit 1 + echo "Unexpected SP_TYPE: ${SP_TYPE}" + # exit 1 fi # ------------------------------------------------------------------------ echo "####################################################" -echo "# USING: ${SP_TYPE}" -echo "# SPSIZE: ${SPSIZE}" -echo "# PPSIZE: ${SPSIZE}" -echo "# MPSIZE: ${MPSIZE}" -echo "# ZERO_STAGE: ${ZERO_STAGE}" 
-echo "# WORLD_SIZE: ${WORLD_SIZE}" -echo "# USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "USING: ${SP_TYPE}" +echo "SPSIZE: ${SPSIZE}" +echo "PPSIZE: ${SPSIZE}" +echo "MPSIZE: ${MPSIZE}" +echo "ZERO_STAGE: ${ZERO_STAGE}" +echo "WORLD_SIZE: ${WORLD_SIZE}" +echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" echo "####################################################" -echo "########################################################" -echo "| ${SP_TYPE} sequence parallelism, with: " -echo "| {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" +echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" echo "########################################################" -# GLOBAL_BATCH=1 -# GLOBAL_BATCH=$(( $GLOBAL_BATCH / $MPSIZE / $PPSIZE / $SPSIZE )) -# GLOBAL_BATCH=$(( $NGPUS * $MICRO_BATCH * $GRADIENT_ACCUMULATION_STEPS )) GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GRADIENT_ACCUMULATION_STEPS )) echo "GB = NGPUS * MB * GAS = ${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS} = ${GLOBAL_BATCH}" GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GRADIENT_ACCUMULATION_STEPS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" + +if [[ "${GLOBAL_BATCH}" == 0 ]]; then + GLOBAL_BATCH=1 +fi +# [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" export GLOBAL_BATCH="$GLOBAL_BATCH" echo "--------------------------------" @@ -246,19 +144,17 @@ echo "--------------------------------" # ┏━━━━━━━━━━━━┓ # ┃ Data paths ┃ # ┗━━━━━━━━━━━━┛ +[ "$(hostname)==login*" ] && MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +[ "$(hostname)==nid*" ] && 
MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +[ "$(hostname)==theta*" ] && MEGATRON_DIR="/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +[ "$(hostname)==x3*" ] && MEGATRON_DIR="/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" + # DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document -DATA_DIR="${PARENT}/dataset" +DATA_DIR="${MEGATRON_DIR}/dataset" DATA_PATH="${DATA_DIR}/BookCorpusDataset_text_document" VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" -# DATA_PATH="/home/czh5/genome/Megatron-DeepSpeed/dataset/BookCorpusDataset_text_document" -# VOCAB_FILE="/home/czh5/genome/Megatron-DeepSpeed/dataset/gpt2-vocab.json" -# MERGE_FILE="/home/czh5/genome/Megatron-DeepSpeed/dataset/gpt2-merges.txt" -# DATA_PATH="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/BookCorpusDataset_text_document" -# VOCAB_FILE="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/gpt2-vocab.json" -# MERGE_FILE="/lus/eagle/projects/MDClimSim/chengming/gpt_datasets1/gpt2-merges.txt" - # ┏━━━━━━━━━━━━━━━━━━━┓ # ┃ FILE I/O SETTINGS ┃ # ┗━━━━━━━━━━━━━━━━━━━┛ @@ -268,17 +164,35 @@ RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" RUN_STR="${MODEL_SIZE}_${RUN_STR}" -if [[ $USE_FLASH_ATTN == 1 ]] ; then - RUN_STR="flashAttn_${RUN_STR}" +# if [[ "${USE_FLASH_ATTN}" == 0 ]]; then +# echo "Not using Flash Attention!!" 
+# else +# +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + # Flash Attention 1 + [ "${USE_FLASH_ATTN}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN_V1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + # Flash Attention 2 + [ "${USE_FLASH_ATTN2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" + [ "${USE_FLASH_ATTN_V2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + # Triton + Flash Attn + # Triton + Flash Attn + [ "${USE_FLASH_ATTN_TRITON}" ] && RUN_STR="flashAttn_triton_${RUN_STR}" +else + echo "Not using Flash Attention!" fi + if [[ $DDP_IMPL == 'FSDP' ]]; then - RUN_STR="FSDP_${RUN_STR}" + RUN_STR="FSDP_${RUN_STR}" fi if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]] ;then - RUN_STR="actCkpt_${RUN_STR}" + RUN_STR="actCkpt_${RUN_STR}" fi if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then - RUN_STR="SP_${RUN_STR}" + RUN_STR="SP_${RUN_STR}" fi RUN_STR="${MODEL_TYPE}_${RUN_STR}" @@ -287,6 +201,9 @@ OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" +DATE=$(date) +export DATE="${DATE}" +export RUN_STR="${RUN_STR}" export MODEL_SIZE="$MODEL_SIZE" export TENSORBOARD_DIR=$TENSORBOARD_DIR export OUTPUT_DIR=$OUTPUT_DIR @@ -311,9 +228,9 @@ echo "OUTPUT TO: ${OUTPUT_DIR}" # echo "NVME_PATH: ${NVME_PATH}" if [[ $MODEL_TYPE == "gpt" ]] ; then - DATA_LOAD_ARGS="--data-path $DATA_PATH --vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" + DATA_LOAD_ARGS="--data-path $DATA_PATH --vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" else - DATA_LOAD_ARGS="" + DATA_LOAD_ARGS="" fi # Set to cpu for offloading to cpu for larger models @@ -329,319 +246,241 @@ CPU_OPTIM=" --cpu-optimizer" # ┗━━━━━━━━━━━━━━━━━━┛ DS_CONFIG=${PARENT}/ds_config-gpt.json echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" -echo "! 
DS_CONFIG: ${DS_CONFIG}" +echo " DS_CONFIG: ${DS_CONFIG}" echo "!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~!" -# "optimizer": { -# "type": "Adam", -# "params": { -# "lr": 0.001, -# "betas": [0.8, 0.999], -# "eps": 1e-8, -# "weight_decay": 3e-7 -# } -# }, - -# "zero_allow_untested_optimizer": false, -# "train_batch_size" : $GLOBAL_BATCH, -# "zero_force_ds_cpu_optimizer": false, + if [[ $ZERO_STAGE == "3" ]] ; then -cat < "$DS_CONFIG" -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH, - "steps_per_print": 1, - "wall_clock_breakdown" : true, - "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 3, - "stage3_max_live_parameters": 3e9, - "stage3_max_reuse_distance": 3e9, - "stage3_param_persistence_threshold": 1e5, - "stage3_prefetch_bucket_size": 1e9, - "contiguous_gradients": true, - "overlap_comm": true, - "reduce_bucket_size": 90000000, - "sub_group_size": 5e7, - "offload_param": { - "device": "cpu", - "pin_memory": true + cat < "$DS_CONFIG" + { + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "stage3_max_live_parameters": 3e9, + "stage3_max_reuse_distance": 3e9, + "stage3_param_persistence_threshold": 1e5, + "stage3_prefetch_bucket_size": 1e9, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_bucket_size": 90000000, + "sub_group_size": 5e7, + "offload_param": { + "device": "cpu", + "pin_memory": true }, "offload_optimizer": { - "device": "cpu", - "buffer_count": 4, - "pipeline_read": false, - "pipeline_write": false, - "pin_memory": true - } - }, - "fp16": { - "enabled": true, - "initial_scale_power" : 12, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "aio": { - "block_size": 1048576, - "queue_depth": 16, - "single_submit": false, - "overlap_events": true, - 
"thread_count": 2 - }, - "flops_profiler": { - "enabled": true, - "profile_step": 1, - "module_depth": -1, - "top_modules": 3, - "detailed": true, - "output_file": null - }, - "comms_logger": { - "enabled": true, - "verbose": false, - "prof_all": false, - "debug": false - }, - "wandb": { - "enabled": true, - "project": "Megatron-DS-Benchmarking" - } + "device": "cpu", + "buffer_count": 4, + "pipeline_read": false, + "pipeline_write": false, + "pin_memory": true +} +}, +"fp16": { +"enabled": true, +"initial_scale_power" : 12, +"loss_scale_window": 1000, +"hysteresis": 2, +"min_loss_scale": 1 +}, +"aio": { +"block_size": 1048576, +"queue_depth": 16, +"single_submit": false, +"overlap_events": true, +"thread_count": 2 +}, +"flops_profiler": { +"enabled": true, +"profile_step": 1, +"module_depth": -1, +"top_modules": 3, +"detailed": true, +"output_file": null +}, +"comms_logger": { +"enabled": true, +"verbose": false, +"prof_all": false, +"debug": false +}, +"wandb": { +"enabled": true, +"project": "GenSLM-Megatron-DS" +} } EOT else -cat < "$DS_CONFIG" -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH, - "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, - "steps_per_print": 1, - "wall_clock_breakdown" : true, - "zero_force_ds_cpu_optimizer": false, - "zero_optimization": { - "stage": $ZERO_STAGE, - "allgather_partitions": true, - "reduce_scatter": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "contiguous_gradients": true, + cat < "$DS_CONFIG" + { + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "gradient_accumulation_steps": $GRADIENT_ACCUMULATION_STEPS, + "steps_per_print": 1, + "wall_clock_breakdown" : true, + "zero_force_ds_cpu_optimizer": false, + "zero_optimization": { + "stage": $ZERO_STAGE, + "allgather_partitions": true, + "reduce_scatter": true, + "allgather_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true, + "offload_param": { + "device": "cpu", + "nvme_path": "/raid/scratch", + "pin_memory": false + 
}, "offload_optimizer": { - "device": "cpu" - } - }, - "optimizer": { - "type": "OneBitAdam" - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - }, - "fp16": { - "enabled": true, - "initial_scale_power": 12 - }, - "flops_profiler": { - "enabled": true, - "profile_step": 1, - "module_depth": -1, - "top_modules": 3, - "detailed": true, - "output_file": null - }, - "comms_logger": { - "enabled": true, - "verbose": false, - "prof_all": false, - "debug": false - }, - "wandb": { - "enabled": true, - "project": "Megatron-DS-Benchmarking" - } + "device": "cpu", + "nvme_path": "/raid/scratch/" +} +}, +"scheduler": { +"type": "WarmupLR", +"params": { +"warmup_min_lr": 0, +"warmup_max_lr": 0.001, +"warmup_num_steps": 1000 +} +}, +"fp16": { +"enabled": true, +"initial_scale_power": 12 +}, +"flops_profiler": { +"enabled": true, +"profile_step": 1, +"module_depth": -1, +"top_modules": 3, +"detailed": true, +"output_file": null +}, +"comms_logger": { +"enabled": true, +"verbose": false, +"prof_all": false, +"debug": false +}, +"wandb": { +"enabled": true, +"project": "GenSLM-Megatron-DS" +} } EOT fi -# "optimizer": { -# "type": "Adam", -# "params": { -# "lr": 0.001, -# "betas": [0.8, 0.999], -# "eps": 1e-8, -# "weight_decay": 3e-7 -# } -# }, -# -# "offload_optimizer": { -# "device": "$OFFLOAD_DEVICE", -# "buffer_count": 4, -# "pipeline_read": false, -# "pipeline_write": false, -# "pin_memory": true -# } -# "train_batch_size" : $GLOBAL_BATCH, -# 'offload_optimizer': 'cpu' - # "train_batch_size" : $GLOBAL_BATCH, -# "offload_optimizer": { -# "device": "cpu", -# "nvme_path": "/raid/scratch/" -# } -# -# "optimizer": { -# "type": "AdamW", -# "params": { -# "lr": 0.001, -# "betas": [0.8, 0.999], -# "eps": 1e-8, -# "weight_decay": 3e-7 -# } -# }, -# "optimizer": { -# "type": "OneBitAdam", -# "params": { -# "lr": 0.001, -# "betas": [ -# 0.8, -# 0.999 -# ], -# "eps": 1e-8, -# "weight_decay": 3e-7, -# 
"freeze_step": 400, -# "cuda_aware": false, -# "comm_backend_name": "nccl" -# } -# }, -# -# "optimizer": "Adam", -# "optimizer": { -# "type": "OneBitAdam", -# "params": { -# "lr": 0.001, -# "betas": [ -# 0.8, -# 0.999 -# ], -# "eps": 1e-8, -# "weight_decay": 3e-7, -# "freeze_step": 400, -# "cuda_aware": true, -# "comm_backend_name": "nccl" -# } -# }, -# -# -# 'deepspeed_mpi': True, -# 'ds_pipeline_enabled': False, -# 'rank': 0, -# 'world_size': 1, -# 'transformer_pipeline_model_parallel_size': 1, -# 'data_parallel_size': 1, -# 'virtual_pipeline_model_parallel_ size': None, - # ┏━━━━━━━━━━━━━━━━━━━━━┓ # ┃ DeepSpeed Arguments ┃ # ┗━━━━━━━━━━━━━━━━━━━━━┛ if [[ "$DDP_IMPL" != "FSDP" ]] ; then - ds_args="" - ds_args=" --deepspeed ${ds_args}" - ds_args=" --deepspeed_mpi ${ds_args}" - ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" - ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - if [[ "$PPSIZE" == 1 ]]; then - ds_args="--no-pipeline-parallel ${ds_args}" - else - ds_args=" --pipeline-model-parallel-size ${PPSIZE} ${ds_args}" - fi - if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - ds_args=" --deepspeed-activation-checkpointing ${ds_args}" - fi + ds_args="" + ds_args=" --deepspeed ${ds_args}" + ds_args=" --deepspeed_mpi ${ds_args}" + ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + if [[ "$PPSIZE" == 1 ]]; then + ds_args="--no-pipeline-parallel ${ds_args}" + else + ds_args=" --pipeline-model-parallel-size ${PPSIZE} ${ds_args}" + fi + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + fi fi # ┏━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ MEGATRON-LM SETTINGS ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━┛ -# "--sequence-parallel-size ${SPSIZE}" gpt_args=( - "--seed ${RANDOM}" - "--DDP-impl ${DDP_IMPL}" - "--pipeline-model-parallel-size ${PPSIZE}" - "--tensor-model-parallel-size ${MPSIZE}" - "--num-layers ${NLAYERS}" - "--hidden-size ${HIDDEN}" - "--num-attention-heads ${ATEN_HEADS}" 
- "--micro-batch-size ${MICRO_BATCH}" - "--global-batch-size ${GLOBAL_BATCH}" - "--seq-length ${SEQ_LEN}" - "--max-position-embeddings ${SEQ_LEN}" - "--train-iters 10" - "--lr-decay-iters 320000" - "--num-workers 1" - "$DATA_LOAD_ARGS" - "--data-impl mmap" - "--split 949,50,1" - "--distributed-backend nccl" - "--lr 0.00015" - "--lr-decay-style cosine" - "--min-lr 1.0e-5" - "--weight-decay 1e-2" - "--clip-grad 1.0" - "--lr-warmup-fraction .01" - "--log-interval 1" - "--save-interval 1000" - "--eval-interval 1000" - "--eval-iters 10" - "--override-opt_param-scheduler" - "--tensorboard-dir ${TENSORBOARD_DIR}" - "--log-timers-to-tensorboard" - "--tensorboard-log-interval 1" + "--no-async-tensor-model-parallel-allreduce" + "--seed ${RANDOM}" + "--DDP-impl ${DDP_IMPL}" + "--pipeline-model-parallel-size ${PPSIZE}" + "--tensor-model-parallel-size ${MPSIZE}" + "--ds-sequence-parallel-size ${SPSIZE}" + "--num-layers ${NLAYERS}" + "--hidden-size ${HIDDEN}" + "--num-attention-heads ${ATEN_HEADS}" + "--micro-batch-size ${MICRO_BATCH}" + "--global-batch-size ${GLOBAL_BATCH}" + "--seq-length ${SEQ_LEN}" + "--max-position-embeddings ${SEQ_LEN}" + "--train-iters 10" + "--lr-decay-iters 320000" + "--num-workers 1" + "$DATA_LOAD_ARGS" + "--data-impl mmap" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr 0.00015" + "--lr-decay-style cosine" + "--min-lr 1.0e-5" + "--weight-decay 1e-2" + "--clip-grad 1.0" + "--lr-warmup-fraction .01" + "--log-interval 1" + "--save-interval 1000" + "--eval-interval 1000" + "--eval-iters 0" + "--override-opt_param-scheduler" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + "--tensorboard-log-interval 1" ) -# --recompute-activations \ -# --recompute-granularity full \ -# --recompute-method uniform \ -# --recompute-num-layers 1 \ + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - gpt_args+=( + gpt_args+=( "--checkpoint-activations" "--checkpoint-num-layers 1" - ) +) fi if [[ "$DDP_IMPL" != "FSDP" ]] ; then - 
gpt_args+=( + gpt_args+=( # "${gpt_args[*]}" "--fp16" - ) +) else - gpt_args+=( + gpt_args+=( "--bf16" - ) +) fi -if [[ "$USE_FLASH_ATTN" == 1 ]] ; then - gpt_args+=( - "--use-flash-attn" - ) +# Flash Attention v1 +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + [ "${USE_FLASH_ATTN}" ] && gpt_args+=("--use-flash-attn-v1") + [ "${USE_FLASH_ATTN1}" ] && gpt_args+=("--use-flash-attn-v1") + [ "${USE_FLASH_ATTN_V1}" ] && gpt_args+=("--use-flash-attn-v1") +# Flash Attention 2 +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + [ "${USE_FLASH_ATTN2}" ] && gpt_args+=("--use-flash-attn-v2") + [ "${USE_FLASH_ATTN_V2}" ] && gpt_args+=("--use-flash-attn-v2") +# Triton + Flash Attn +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + [ "${USE_FLASH_ATTN_TRITON}" ] && gpt_args+=("--use-flash-attn-triton") fi -# if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then -# gpt_args+=( -# "--sequence-parallel" -# ) -# fi +if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then + gpt_args+=( + "--sequence-parallel" +) +fi -if [[ "${SP_TYPE}" == "ds" ]]; then - gpt_args+=( +if [[ "$ZERO_STAGE" > "0" ]] ; then + gpt_args+=( "--cpu-optimizer" - ) +) fi export gpt_args=( - "${gpt_args[*]}" - "${ds_args[*]}" +"${gpt_args[*]}" +"${ds_args[*]}" ) +ARGS="$(join_by ' ' ${gpt_args[*]})" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "gpt_args: ${gpt_args[*]}" +echo "ARGS: ${ARGS}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" diff --git a/ALCF/benchmark_train.sh b/ALCF/benchmark_train.sh deleted file mode 100755 index 94eb94e116..0000000000 --- a/ALCF/benchmark_train.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash --login - -TSTAMP=$(date "+%Y-%m-%d-%H%M%S") -# DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) -# - -#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -#┃ Make sure we're not already running; if so, exit here ┃ -#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') -if [ -n 
"${PIDS}" ]; then - echo "Already running! Exiting!" - exit 1 -fi - - -SOURCE=${BASH_SOURCE[0]} -while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink - DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) - SOURCE=$(readlink "$SOURCE") - [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located -done -DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) - - -function sourceFile() { - FILE="$1" - echo "source-ing ${FILE}" - if [[ -f "${FILE}" ]]; then - # shellcheck source="${FILE}" - source "${FILE}" - else - echo "ERROR: UNABLE TO SOURCE ${FILE}" - fi -} - -SETUP_FILE="${DIR}/setup.sh" -MODEL_FILE="${DIR}/model.sh" -ARGS_FILE="${DIR}/args.sh" -LAUNCH_FILE="${DIR}/launch.sh" - - -sourceFile "${SETUP_FILE}" -sourceFile "${MODEL_FILE}" -sourceFile "${ARGS_FILE}" -sourceFile "${LAUNCH_FILE}" - - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -echo "My current script is: ${SCRIPT_DIR[0]}" - -NHOSTS=$(wc -l < "${COBALT_NODEFILE}") -# NGPU_PER_HOST=8 -NGPU_PER_HOST=$(nvidia-smi -L | wc -l) -PARALLEL_SIZE=$(( NHOSTS * NGPU_PER_HOST )) - -export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt -export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron - -echo "+-----------------------------+" -echo "| MODEL TYPE: ${MODEL_TYPE}" -echo "| SP TYPE: ${SP_TYPE}" -echo "+-----------------------------+" - -#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -#┃ source ./launch.sh ┃ -#┃ which then sources ./{args.sh,setup.sh} ┃ -#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ - -export MODEL_TYPE=${MODEL_TYPE:-gpt} - -setup -# singleGPU "$@" 2>&1 & -# fullNode "$@" 2>&1 & -TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') -export TORCH_VERSION=$TORCH_VERSION -export CUDA_DEVICE_MAX_CONNECTIONS=1 -# fullNode "$@" -# elasticDistributed "$@" 2>&1 & -# elasticDistributed "$@" -PID=$! 
-wait $PID diff --git a/ALCF/launch.sh b/ALCF/launch.sh index 7604098667..c61a0e22ce 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -2,18 +2,17 @@ HOST=$(hostname) -# # DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) -# SOURCE=${BASH_SOURCE[0]} -# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink -# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) -# SOURCE=$(readlink "$SOURCE") -# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located -# done +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +# function join_by { +# local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; +# } -# HERE=$(python3 -c 'import os; print(os.getcwd())') -# ALCF_DIR="${HERE}/ALCF" -# -ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" PARENT=$(dirname "${ALCF_DIR}") echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" @@ -23,19 +22,16 @@ echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" function sourceFile() { - FILE="$1" - echo "source-ing ${FILE}" - if [[ -f "${FILE}" ]]; then - # shellcheck source=./setup.sh - source "${FILE}" - else - echo "ERROR: UNABLE TO SOURCE ${FILE}" - fi + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source=./setup.sh + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi } -# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) -# PARENT=$(dirname "${DIR}") - MASTER_ADDR=$(uname -n) MASTER_PORT=20010 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" @@ -46,106 +42,126 @@ sourceFile "${ALCF_DIR}/args.sh" MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" printJobInfo() { - echo "Job started at: ${TSTAMP} on $(hostname)" - echo "Job running in: ${DIR}" - 
echo "Training GPT-3 with ${MODEL_SIZE} parameters" - echo "Writing logs to: ${OUTPUT_DIR}" - echo 'to view output: tail -f $(tail -1 logfiles)' - echo "i.e. tail -f $(tail -1 "${PARENT}"/logfiles)" + echo "Job started at: ${TSTAMP} on $(hostname)" + echo "Job running in: ${DIR}" + echo "Training GPT-3 with ${MODEL_SIZE} parameters" + echo "Writing logs to: ${OUTPUT_DIR}" + echo 'to view output: tail -f $(tail -1 logfiles)' + echo "i.e. tail -f $(tail -1 "${PARENT}"/logfiles)" } launchJob() { - echo "using: $(which python3)" | tee -a "${OUTPUT_LOG}" - printJobInfo | tee -a "${OUTPUT_LOG}" - echo EXEC="${EXEC}" | tee -a "${OUTPUT_LOG}" - echo "Writing logs to: ${OUTPUT_LOG}" | tee -a "${OUTPUT_LOG}" - ${EXEC} "$@" # >> "${OUTPUT_LOG}" 2>&1 & -} - -singleGPU() { - echo "\ - Running on 1 host \ - with 1 GPUs each \ - for a total of 1 GPUs" - EXEC="\ - $(which python3) \ - ${MAIN} \ - ${gpt_args} \ - ${ds_args}" - OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts1-ngpu1-$TSTAMP.log" - mkdir -p "$(dirname "${OUTPUT_LOG}")" - echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" - printJobInfo | tee -a "${OUTPUT_LOG}" - launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + echo "using: $(which python3)" | tee -a "${OUTPUT_LOG}" + printJobInfo | tee -a "${OUTPUT_LOG}" + echo EXEC="${EXEC}" | tee -a "${OUTPUT_LOG}" + echo "Writing logs to: ${OUTPUT_LOG}" | tee -a "${OUTPUT_LOG}" + # ARGS="$@" + # export ARGS="$ARGS" + ${EXEC} "$@" # >> "${OUTPUT_LOG}" 2>&1 & } # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ Use all available GPUs a single nodes ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ fullNode() { - echo "fullNode started" - echo "MPI_COMMAND ${MPI_COMMAND}" - echo "MPI_DEFAULTS ${MPI_DEFAULTS}" - echo "NGPUS ${NGPUS}" - echo "hostfile ${DIR}/hostfile" - echo "MAIN ${MAIN}" - echo "gpt_args ${gpt_args}" - NHOSTS=$(wc -l < "${HOSTFILE}") - NGPU_PER_HOST=$(nvidia-smi -L | wc -l) - # NGPU_PER_HOST=1 - NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) - # hostname > $DIR/hostfile - echo "\ - Running on 
$NHOSTS hosts \ - with $NGPU_PER_HOST GPUs each \ - for a total of $NGPUS GPUs" - EXEC="\ - ${MPI_COMMAND} \ - ${MPI_DEFAULTS} \ - "${MPI_ELASTIC}" - ${MPI_WRAPPER} ${MASTER_ADDR} ${MASTER_PORT} \ - ${MAIN} \ - ${gpt_args} \ - ${ds_args}" - OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" - mkdir -p "$(dirname "${OUTPUT_LOG}")" - echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" - printJobInfo | tee -a "${OUTPUT_LOG}" - launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}" + echo "fullNode started" + echo "MPI_COMMAND ${MPI_COMMAND}" + echo "MPI_DEFAULTS ${MPI_DEFAULTS}" + echo "NGPUS ${NGPUS}" + echo "hostfile ${DIR}/hostfile" + echo "MAIN ${MAIN}" + echo "gpt_args ${gpt_args}" + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + # hostname > $DIR/hostfile + echo "\ + Running on $NHOSTS hosts \ + with $NGPU_PER_HOST GPUs each \ + for a total of $NGPUS GPUs" + _EXEC=( + "${MPI_COMMAND}" + "${MPI_DEFAULTS}" + "${MPI_ELASTIC}" + "${MPI_WRAPPER}" + "${MASTER_ADDR}" + "${MASTER_PORT}" + "${MAIN}" + "${gpt_args}" + "${ds_args}" + ) + # EXEC=$(join_by ' ' "${EXEC[*]}") + EXEC="${EXEC[*]}" + OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}" } # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ Use all available GPUs on all available nodes ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ elasticDistributed() { - NHOSTS=$(wc -l < "${HOSTFILE}") - NGPU_PER_HOST=$(nvidia-smi -L | wc -l) - NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) - export WORLD_SIZE="${NGPUS}" - echo "\ - Running on ${NHOSTS} hosts \ - with ${NGPU_PER_HOST} GPUs each \ - for a total of ${NGPUS} GPUs" - EXEC_STR=( - "${MPI_COMMAND}" - "${MPI_DEFAULTS}" - "${MPI_ELASTIC}" - "$(which python3)" - "${MAIN}" - "${gpt_args}" - 
"${ds_args}" - ) - EXEC="${EXEC_STR[*]}" - OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" - echo "Writing logs to: ${OUTPUT_LOG}" - mkdir -p "$(dirname "${OUTPUT_LOG}")" - echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" - printJobInfo | tee -a "${OUTPUT_LOG}" - # launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & - # launchJob "$@" - # printJobInfo | tee -a "${OUTPUT_LOG}" - # launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & - launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & - PID=$! - wait $PID + if [[ "$(hostname)==theta*" || "$(hostname)==x3*" ]]; then + if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + HOSTFILE="${COBALT_NODEFILE}" + elif [[ $(hostname) == x3* ]]; then + echo "Setting up Polaris from $(hostname)" + HOSTFILE="${PBS_NODEFILE}" + else + echo "Unknown hostname $(hostname)" + exit 1 + fi + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + EXEC_STR=( + "${MPI_COMMAND}" + "${MPI_DEFAULTS}" + "${MPI_ELASTIC}" + "$(which python3)" + "${MAIN}" + "${gpt_args}" + "${ds_args}" + ) + elif [[ "$(hostname)==nid*" || "$(hostname)==login*" ]]; then + echo "Setting up from Perlmutter on $(hostname)" + [ "$(hostname)==nid*" ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 + [ "$(hostname)==nid*" ] && export MACHINE="perlmutter" || export MACHINE="NERSC" + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + # NGPU_PER_HOST="$SLURM_GPUS_ON_NODE" + NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + # export MACHINE="perlmutter" + export MASTER_ADDR="127.0.0.1" + export MASTER_PORT="5432" + EXEC_STR=( + "srun" + "-N ${NHOSTS}" + "-n ${NGPUS}" + "-l -u" + "$(which python3)" + "${MAIN}" + "${gpt_args}" + "${ds_args}" + ) + else + echo "Unexpected hostname $(hostname)" + fi + export WORLD_SIZE="${NGPUS}" + echo "\ + Running on ${NHOSTS} hosts \ + with ${NGPU_PER_HOST} GPUs each \ + for a total of ${NGPUS} GPUs" + EXEC="${EXEC_STR[*]}" + 
OUTPUT_LOG="${OUTPUT_DIR}/logs/$USER-$HOST-nhosts${NHOSTS}-ngpu${NGPUS}-$TSTAMP.log" + echo "EXEC_STR: ${EXEC_STR}" + echo "Writing logs to: ${OUTPUT_LOG}" + mkdir -p "$(dirname "${OUTPUT_LOG}")" + echo "${OUTPUT_LOG}" >> "${PARENT}/logfiles" + printJobInfo | tee -a "${OUTPUT_LOG}" + launchJob "$@" >> "${OUTPUT_LOG}" 2>&1 & + PID=$! + wait $PID } diff --git a/ALCF/model.sh b/ALCF/model.sh index 2427893e9b..c41a908203 100755 --- a/ALCF/model.sh +++ b/ALCF/model.sh @@ -258,6 +258,28 @@ A_NLAYERS[$MODEL_145B_KEY]=80 A_HIDDEN[$MODEL_145B_KEY]=12288 A_ATEN_HEADS[$MODEL_145B_KEY]=96 +MODEL_1T_1L_KEY="GPT1T_1L" +A_NLAYERS[$MODEL_1T_1L_KEY]=1 +A_HIDDEN[$MODEL_1T_1L_KEY]=25600 +A_ATEN_HEADS[$MODEL_1T_1L_KEY]=160 + +MODEL_1T_2L_KEY="GPT1T_2L" +A_NLAYERS[$MODEL_1T_2L_KEY]=2 +A_HIDDEN[$MODEL_1T_2L_KEY]=25600 +A_ATEN_HEADS[$MODEL_1T_2L_KEY]=160 + +MODEL_1T_4L_KEY="GPT1T_4L" +A_NLAYERS[$MODEL_1T_4L_KEY]=4 +A_HIDDEN[$MODEL_1T_4L_KEY]=25600 +A_ATEN_HEADS[$MODEL_1T_4L_KEY]=160 + + +MODEL_1T_8L_KEY="GPT1T_8L" +A_NLAYERS[$MODEL_1T_8L_KEY]=8 +A_HIDDEN[$MODEL_1T_8L_KEY]=25600 +A_ATEN_HEADS[$MODEL_1T_8L_KEY]=160 + + export MODEL_SIZE="${MODEL_SIZE_KEY}" export NLAYERS="${A_NLAYERS[$MODEL_SIZE_KEY]}" export HIDDEN="${A_HIDDEN[$MODEL_SIZE_KEY]}" diff --git a/ALCF/mpi_wrapper b/ALCF/mpi_wrapper deleted file mode 100755 index 770f950e1d..0000000000 --- a/ALCF/mpi_wrapper +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -set -e - -PYTHON=${PYTHON:-python3} - -MASTER_ADDR="$1" -MASTER_PORT="$2" -TRAINING_SCRIPT="$3" - -shift 3 - -test -n "${MASTER_ADDR}" -a -n "${MASTER_PORT}" -a -n "${OMPI_COMM_WORLD_RANK}" -a -n "${OMPI_COMM_WORLD_SIZE}" -a -n "${OMPI_COMM_WORLD_LOCAL_RANK}" -test -f "${TRAINING_SCRIPT}" - -set -x - -LOCAL_RANK=$((OMPI_COMM_WORLD_RANK % 8)) - -exec env \ - MASTER_ADDR="${MASTER_ADDR}" \ - MASTER_PORT="${MASTER_PORT}" \ - RANK="${OMPI_COMM_WORLD_RANK}" \ - WORLD_SIZE="${OMPI_COMM_WORLD_SIZE}" \ - ${PYTHON} -u "${TRAINING_SCRIPT}" "--local_rank=${LOCAL_RANK}" "$@" - 
-exit 1 diff --git a/ALCF/setup.sh b/ALCF/setup.sh index 0b90daf02e..b3b135c513 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -1,228 +1,244 @@ #!/bin/bash --login # -# DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -LP) -SOURCE=${BASH_SOURCE[0]} -while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink - DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) - SOURCE=$(readlink "$SOURCE") - [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located -done -DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) -PARENT=$(dirname "${DIR}") - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -thetagpuMPI() { - NHOSTS=$(wc -l < "${COBALT_NODEFILE}") - NGPU_PER_HOST=$(nvidia-smi -L | wc -l) - NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) - NVME_PATH="/raid/scratch/" - MPI_COMMAND=$(which mpirun) - # export PATH="${CONDA_PREFIX}/bin:${PATH}" - MPI_DEFAULTS="\ - --hostfile ${HOSTFILE} \ - -x CFLAGS \ - -x LDFLAGS \ - -x http_proxy \ - -x PYTHONUSERBASE \ - -x https_proxy \ - -x PATH \ - -x CUDA_DEVICE_MAX_CONNECTIONS \ - -x LD_LIBRARY_PATH" - MPI_ELASTIC="\ - -n ${NGPUS} \ - -npernode ${NGPU_PER_HOST}" +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' } -polarisMPI() { - NHOSTS=$(wc -l < "${PBS_NODEFILE}") - NGPU_PER_HOST=$(nvidia-smi -L | wc -l) - NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) - MPI_COMMAND=$(which mpiexec) - NVME_PATH="/local/scratch/" - MPI_DEFAULTS="\ - --envall \ - --verbose \ - --hostfile ${HOSTFILE}" - MPI_ELASTIC="\ - -n ${NGPUS} \ - --ppn ${NGPU_PER_HOST}" +HERE=$(WhereAmI) +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function setupVenv() { + VENV_DIR="$1" + # VENV_DIR="${PARENT}/venvs/perlmutter/torch2.0.1/" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found 
venv at: ${VENV_DIR}" + source "${VENV_DIR}/bin/activate" + else + echo "Skipping setupVenv() on $(hostname)" + fi } -setupMPI() { - if [[ $(hostname) == theta* ]]; then - echo "Setting up MPI on ThetaGPU from $(hostname)" - thetagpuMPI - elif [[ $(hostname) == x* ]]; then - echo "Setting up MPI on Polaris from $(hostname)" - polarisMPI - else - echo "Unexpected hostname $(hostname)" - fi +function loadCondaEnv() { + if [[ "${CONDA_EXE}" ]]; then + echo "Already inside ${CONDA_EXE}, exiting!" + else + MODULE_STR="$1" + module load "conda/${MODULE_STR}" + conda activate base + fi } -condaThetaGPU220701() { - module load conda/2022-07-01 ; conda activate base - conda activate \ - /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2022-07-01 - # if [[ -f "${PARENT}/.venvs/thetaGPU/2022-07-01-deepspeed/bin/activate" ]]; then - # echo "Found virtual environment!" - # source "${PARENT}/.venvs/thetaGPU/2022-07-01-deepspeed/bin/activate" - # fi +function thetagpuMPI() { + if [[ $(hostname) == theta* ]]; then + export HOSTFILE="${COBALT_NODEFILE}" + NHOSTS=$(wc -l < "${COBALT_NODEFILE}") + NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + NVME_PATH="/raid/scratch/" + MPI_COMMAND=$(which mpirun) + # export PATH="${CONDA_PREFIX}/bin:${PATH}" + _MPI_DEFAULTS=( + "--hostfile ${HOSTFILE}" + "-x CFLAGS" + "-x LDFLAGS" + "-x http_proxy" + "-x PYTHONUSERBASE" + "-x https_proxy" + "-x PATH" + "-x CUDA_DEVICE_MAX_CONNECTIONS" + "-x LD_LIBRARY_PATH" + ) + _MPI_ELASTIC=( + "-n ${NGPUS}" + "-npernode ${NGPU_PER_HOST}" + ) + export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + else + echo "Skipping thetaGPUMPI() on $(hostname)" + fi } -condaThetaGPU230111() { - module load conda/2023-01-11 ; conda activate base -# conda activate \ -# /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-01-11-deepspeed - 
VENV_DIR="${PARENT}/venvs/thetaGPU/2023-01-11-deepspeed" - if [[ -d "${VENV_DIR}" ]] ; then - echo "Found venv at: ${VENV_DIR}" - # shellcheck source='../venvs/thetaGPU/2023-01-10/bin/activate' - source "${VENV_DIR}/bin/activate" - fi +function polarisMPI() { + if [[ $(hostname) == x3* ]]; then + export HOSTFILE="${PBS_NODEFILE}" + export NHOSTS=$(wc -l < "${PBS_NODEFILE}") + export NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + export NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) + export MPI_COMMAND=$(which mpiexec) + export NVME_PATH="/local/scratch/" + _MPI_DEFAULTS=( + "--envall" + "--verbose" + "--hostfile ${HOSTFILE}" + ) + _MPI_ELASTIC=( + "-n ${NGPUS}" + "--ppn ${NGPU_PER_HOST}" + ) + export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + else + echo "Skipping polarisMPI() on $(hostname)" + fi } -condaThetaGPU() { - module load conda/2022-07-01 ; conda activate base - conda activate \ - /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2022-07-01 - echo "USING PYTHON: $(which python3)" +function setupMPI() { + if [[ $(hostname) == theta* ]]; then + echo "Setting up MPI on ThetaGPU from $(hostname)" + thetagpuMPI + elif [[ $(hostname) == x* ]]; then + echo "Setting up MPI on Polaris from $(hostname)" + polarisMPI + else + echo "Skipping setupMPI() on hostname $(hostname)" + fi + echo "++ SetupMPI() +++++++++++++++++++++++++++++++++" + echo "Using HOSTFILE: $HOSTFILE" + echo "NHOSTS: ${NHOSTS}" + echo "NGPU_PER_HOST: ${NGPU_PER_HOST}" + echo "NGPUS: $NGPUS" + echo "+++++++++++++++++++++++++++++++++++++++++++++++" } -condaThetaGPU_mtanaka() { - # module load conda/2023-01-11 ; conda activate base - # conda activate \ - # /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-01-11-deepspeed - VENV_DIR="/lus/grand/projects/datascience/mtanaka/dsseq/venv/dsseq" - if [[ -d "${VENV_DIR}" ]] ; then - echo "Found venv at: ${VENV_DIR}" - # shellcheck 
source='../venvs/thetaGPU/2023-01-10/bin/activate' - source "${VENV_DIR}/bin/activate" - fi +function condaPolaris() { + if [[ "$(hostname)" == x3* ]]; then + DATE_STR="2023-09-29" + [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" + else + echo "Skipping condaPolaris() on $(hostname)" + fi } -condaPolaris220908() { - echo "Loading: 'module load conda 2022-09-08 ; conda activate base'" - module load conda/2022-09-08 ; conda activate base - conda activate /lus/grand/projects/datascience/foremans/locations/polaris/miniconda3/envs/2022-09-08-deepspeed - export CFLAGS="-I${CONDA_PREFIX}/include" - export LDFLAGS="-L${CONDA_PREFIX}/lib" - VENV_DIR="${PARENT}/venvs/polaris/2022-09-08" - if [[ -d "${VENV_DIR}" ]]; then - echo "Found venv at: ${VENV_DIR}" - source "${VENV_DIR}/bin/activate" - fi +function condaThetaGPU() { + if [[ "$(hostname)" == theta* ]]; then + DATE_STR="2023-01-11" + [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" + else + echo "Skipping condaThetaGPU() on $(hostname)" + fi } -condaPolaris230110() { - echo "Loading: 'module load conda 2023-01-10-unstable ; conda activate base'" - module load conda/2023-01-10-unstable ; conda activate base - export CFLAGS="-I${CONDA_PREFIX}/include" - export LDFLAGS="-L${CONDA_PREFIX}/lib" - # conda activate \ - # /lus/grand/projects/datascience/foremans/locations/polaris/miniconda3/envs/2023-01-10 - VENV_DIR="${PARENT}/venvs/polaris/2023-01-10/" - if [[ -d "${VENV_DIR}" ]]; then - echo "Found venv at: ${VENV_DIR}" - # shellcheck source=../venvs/polaris/2023-01-10/bin/activate - source "${VENV_DIR}/bin/activate" - fi +function setupThetaGPU() { + export LAB="ALCF" + export MACHINE="ThetaGPU" + if [[ $(hostname) == theta* ]]; then + setupMPI + DATE_STR="2023-01-11" + [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" || echo "Caught CONDA_EXE: ${CONDA_EXE}" + [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" || echo "Caught 
VIRTUAL_ENV: ${VIRTUAL_ENV}" + else + echo "Skipping setupThetaGPU() on $(hostname)" + fi } -condaThetaGPU230426() { - echo "Loading: 'module load conda 2023-01-10-unstable ; conda activate base'" - module load conda/2023-01-11 - conda activate base - conda activate /lus/grand/projects/datascience/foremans/locations/thetaGPU/miniconda3/envs/2023-04-26 - VENV_DIR="${PARENT}/venvs/thetaGPU/2023-04-26/" - if [[ -d "${VENV_DIR}" ]]; then - echo "Found venv at: ${VENV_DIR}" - # shellcheck source=../venvs/thetaGPU/2023-04-26/ - source "${VENV_DIR}/bin/activate" - fi - thetagpuMPI - export CFLAGS="-I${CONDA_PREFIX}/include" - export LDFLAGS="-L${CONDA_PREFIX}/lib" +function setupPolaris() { + export LAB="ALCF" + export MACHINE="Polaris" + if [[ "$(hostname)" == x3* ]]; then + # SETUP MPI -------------------------------- + setupMPI + # SETUP Python -------------------------------- + DATE_STR="2023-09-29" + [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}-unstable" || echo "Caught CONDA_EXE: ${CONDA_EXE}" + [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" || echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" + else + echo "Skipping setupPolaris() on $(hostname)" + fi } -condaPolaris() { - condaPolaris230110 - echo "USING PYTHON: $(which python3)" +function setupALCF() { + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + setupMPI + [ "$(hostname)==theta*" ] && setupThetaGPU || echo "Skipping setupThetaGPU from $(hostname)" + [ "$(hostname)==x3*" ] && setupPolaris || echo "Skipping setupPolaris from $(hostname)" + else + echo "Skipping setupALCF() on $(hostname)" + fi } -# ┏━━━━━━━━━━┓ -# ┃ ThetaGPU ┃ -# ┗━━━━━━━━━━┛ -setupThetaGPU() { - if [[ $(hostname) == theta* ]]; then - export MACHINE="ThetaGPU" - HOSTFILE="${COBALT_NODEFILE}" - # -- Python / Conda setup ------------------------------------------------- - thetagpuMPI - condaThetaGPU230426 - else - echo "Unexpected hostname: $(hostname)" - fi +# ┏━━━━━━━┓ +# ┃ NERSC ┃ +# ┗━━━━━━━┛ +function setupPerlmutter() { + if [[ 
$(hostname) == login* || $(hostname) == nid* ]]; then + module load libfabric cudatoolkit pytorch/2.0.1 + if [[ $(hostname) == login* ]]; then + export MACHINE="NERSC" + module load pytorch/2.0.1 + export NHOSTS=1 + export NGPU_PER_HOST=1 + export NGPUS=1 + # echo "$(hostname)" > "${HERE}/hostfile" + elif [[ $(hostname) == nid* ]]; then + export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" + export NODE_RANK=0 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export MACHINE="PERLMUTTER" + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + else + echo "Unexpected $(hostname) on NERSC" + fi + echo "+++++++++++++++++++++++++++++++++++" + echo "Using python: $(which python3)" + echo "+++++++++++++++++++++++++++++++++++" + else + echo "Skipping setupPerlmutter() on $(hostname)" + fi } -# ┏━━━━━━━━━┓ -# ┃ Polaris ┃ -# ┗━━━━━━━━━┛ -setupPolaris() { - if [[ $(hostname) == x* ]]; then - export MACHINE="Polaris" - HOSTFILE="${PBS_NODEFILE}" - # -- MPI / Comms Setup ---------------------------------------------------- - condaPolaris - polarisMPI - # export IBV_FORK_SAFE=1 - else - echo "Unexpected hostname: $(hostname)" - fi + +function setupMachine() { + HOSTNAME="$(hostname)" + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + export LAB="ALCF" + setupALCF + [ "${HOSTNAME}==theta*" ] && condaThetaGPU + [ "${HOSTNAME}==x3*" ] && condaPolaris + elif [[ "${HOSTNAME}== nid*" || "${HOSTNAME}== login*" ]]; then + export LAB="NERSC" + setupPerlmutter + [ "${HOSTNAME}==login*" ] && setupPerlmutter + [ "${HOSTNAME}==nid*" ] && setupPerlmutter + else + echo "Unexpected hostname: $(hostname)" + fi } # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ SETUP CONDA + MPI ENVIRONMENT @ ALCF ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -setup() { - # unset PYTHONUSERBASE - export NCCL_DEBUG=warn - export WANDB_CACHE_DIR="./cache/wandb" - CFLAGS="-I${CONDA_PREFIX}/include/" - 
LDFLAGS="-L${CONDA_PREFIX}/lib/" - # export CFLAGS="${CFLAGS}" - # export LDFLAGS="${LDFLAGS}" - # export PATH="${CONDA_PREFIX}/bin:${PATH}" - - export NVME_PATH="${NVME_PATH}" - export MPI_DEFAULTS="${MPI_DEFAULTS}" - export MPI_ELASTIC="${MPI_ELASTIC}" - export MPI_COMMAND="${MPI_COMMAND}" - - PYTHON_EXECUTABLE="$(which python3)" - export PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" - echo "USING PYTHON: $(which python3)" - echo "CFLAGS: ${CFLAGS}" - echo "LDFLAGS: ${LDFLAGS}" - # source "${DIR}/args.sh" - - if [[ $(hostname) == theta* ]]; then - echo "Setting up ThetaGPU from $(hostname)" - setupThetaGPU - elif [[ $(hostname) == x* ]]; then - echo "Setting up Polaris from $(hostname)" - setupPolaris - else - echo "Unexpected hostname $(hostname)" - fi - export NODE_RANK=0 - # export RANK=0 - export NNODES=$NHOSTS - export GPUS_PER_NODE=$NGPU_PER_HOST - export WORLD_SIZE=$NGPUS - export NGPUS="${NGPUS}" - export NHOSTS="${NHOSTS}" - export NGPU_PER_HOST="${NGPU_PER_HOST}" - export CUDA_DEVICE_MAX_CONNECTIONS=1 +function setup() { + export NCCL_DEBUG=warn + # TORCH_EXTENSIONS_DIR="${HERE}/.cache/torch_extensions" + export WANDB_CACHE_DIR="./cache/wandb" + setupMachine + PYTHON_EXECUTABLE="$(which python3)" + export PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" + echo "USING PYTHON: $(which python3)" + # echo "CFLAGS: ${CFLAGS}" + # echo "LDFLAGS: ${LDFLAGS}" + # export NODE_RANK=0 + export NNODES=$NHOSTS + export GPUS_PER_NODE=$NGPU_PER_HOST + export WORLD_SIZE=$NGPUS + export NGPUS="${NGPUS}" + export NHOSTS="${NHOSTS}" + export NGPU_PER_HOST="${NGPU_PER_HOST}" + export CUDA_DEVICE_MAX_CONNECTIONS=1 + echo "########################################" + echo "NHOSTS: ${NHOSTS}" + echo "NGPU_PER_HOST: ${NGPU_PER_HOST}" + echo "NGPUS: (${NHOSTS} * ${NGPU_PER_HOST}) = ${NGPUS}" + echo "########################################" } + +setup diff --git a/ALCF/sweep_args_theta_multinode.sh b/ALCF/sweep_args_theta_multinode.sh deleted file mode 100755 index ac6409233a..0000000000 --- 
a/ALCF/sweep_args_theta_multinode.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -l - -module load conda/2023-01-11 -conda activate base -# cd /home/czh5/seq/Megatron-DS-Benchmarking/ALCF -# source /home/czh5/seq/Megatron-DS-Benchmarking/venvs/thetaGPU/2023-01-11-deepspeed/bin/activate - -# rm -rf /home/czh5/genome/Megatron-DeepSpeed/dataset/*.npy -# rm -rf /home/czh5/genome/Megatron-DeepSpeed/dataset/*.done -# -SCRIPT_PATH="${BASH_SOURCE[0]}" -while [ -L "$SCRIPT_PATH" ]; do - SCRIPT_DIR="$(cd -P "$(dirname "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" - SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" - [[ ${SCRIPT_PATH} != /* ]] && SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_PATH}" -done -SCRIPT_PATH="$(readlink -f "$SCRIPT_PATH")" -SCRIPT_DIR="$(cd -P "$(dirname -- "$SCRIPT_PATH")" >/dev/null 2>&1 && pwd)" - -SOURCE=${BASH_SOURCE[0]} -while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink - DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) - SOURCE=$(readlink "$SOURCE") - [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located -done -DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" - - -function sourceFile() { - FILE="$1" - echo "source-ing ${FILE}" - if [[ -f "${FILE}" ]]; then - # shellcheck source="${FILE}" - source "${FILE}" - else - echo "ERROR: UNABLE TO SOURCE ${FILE}" - fi -} - -SETUP_FILE="${DIR}/setup.sh" -MODEL_FILE="${DIR}/model.sh" -ARGS_FILE="${DIR}/args.sh" -LAUNCH_FILE="${DIR}/launch.sh" - - -sourceFile "${SETUP_FILE}" -sourceFile "${MODEL_FILE}" -sourceFile "${ARGS_FILE}" -sourceFile "${LAUNCH_FILE}" - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -echo "My current script is: ${SCRIPT_DIR[0]}" - - -if [[ $(hostname) == theta* ]]; then - HOSTFILE="${COBALT_NODEFILE}" -elif [[ $(hostname) == x* ]]; then - HOSTFILE="${PBS_NODEFILE}" -else - echo "Unexpected hostname 
$(hostname)" -fi - -echo "Found hostfile: ${HOSTFILE}" - -NHOSTS=$(wc -l < "${HOSTFILE}") -NGPU_PER_HOST=$(nvidia-smi -L | wc -l) -PARALLEL_SIZE=$(( NHOSTS * NGPU_PER_HOST )) - -export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt -export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron - -K_VALS=( - # 2 - # 4 - 8 - # 16 - # 32 - # 64 - # 128 - # 192 - # 256 - # 272 - # 320 - # 384 - # 448 - # 512 - # 1024 -) - -SEQLEN_VALS=( -# 2048 -# 4096 -# 8192 -# 16384 -# 32768 -# 65536 -# 131072 -# 262144 -# 524288 -# 1048576 -# 2097152 -) - -MODEL_SIZE_VALS=( - # "GPT125M" - # "BERT1.2B" -# "GPT1_5B" - # "GPT2_7B" -# "GPT6_7B" - # "GPT13B" - "GPT25B" -# "GPT30B" -# "GPT33B" -) - -for MODEL_SIZE_KEY in "${MODEL_SIZE_VALS[@]}"; do - export MODEL_SIZE_KEY -# for SEQ_LEN in "${SEQLEN_VALS[@]}"; do -# export SEQ_LEN - for NUM_K in "${K_VALS[@]}"; do - # common_factor=$(( $PARALLEL_SIZE * 8 )) - # export SEQ_LEN=$(( 1024 * $NUM_K / $common_factor * $common_factor )) - - export SEQ_LEN=$(( 1024 * $NUM_K )) - - if [[ ${SP_TYPE} == "ds" ]]; then - echo "DS sequence parallel" - export SPSIZE=${PARALLEL_SIZE} - export MPSIZE=1 - export ZERO_STAGE=3 - export USE_SEQUENCE_PARALLEL=0 - bash ./benchmark_train.sh - fi - - if [[ ${SP_TYPE} == "megatron" ]]; then - echo "Megatron's sequence parallel" - - # if [ ${SEQ_LEN} -eq 8192 ]; then - # PARALLEL_SIZE=8 - # fi - - # if [ ${SEQ_LEN} -eq 16384 ]; then - # PARALLEL_SIZE=8 - # fi - - # if [ ${SEQ_LEN} -eq 32768 ]; then - # PARALLEL_SIZE=16 - # fi - - # if [ ${SEQ_LEN} -eq 65536 ]; then - # PARALLEL_SIZE=16 - # fi - - export SPSIZE=1 - export MPSIZE=${PARALLEL_SIZE} - export ZERO_STAGE=0 - export USE_SEQUENCE_PARALLEL=1 - bash ./benchmark_train.sh - fi - - printf "\n------------------------" - echo SEQ_LEN=${SEQ_LEN} - done -done diff --git a/ALCF/train-gpt3.sh b/ALCF/train-gpt3.sh index fa707b5df4..bb054b0386 100755 --- a/ALCF/train-gpt3.sh +++ b/ALCF/train-gpt3.sh @@ -10,7 +10,8 @@ function WhereAmI() { } HERE=$(WhereAmI) 
-ALCF_DIR=$(find "${HERE}" -name "ALCF") +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" # ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" @@ -30,7 +31,7 @@ echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" #┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ #┃ Make sure we're not already running; if so, exit here ┃ #┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') +PIDS=$(ps aux | egrep "$USER.+mpi.+pretrain_gpt.py" | grep -v grep | awk '{print $2}') if [ -n "${PIDS}" ]; then echo "Already running! Exiting!" exit 1 From 7fc2221d20890f1a3c7d1e0ca4f06164ff6de2f5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 3 Oct 2023 23:45:20 -0500 Subject: [PATCH 003/268] Update `pretrain_gpt.py` --- pretrain_gpt.py | 224 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 211 insertions(+), 13 deletions(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 149414848c..272f05318b 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -2,8 +2,11 @@ """Pretrain GPT""" +import os import torch import math +# import logging + from functools import partial from megatron import get_args from megatron import print_rank_0 @@ -18,25 +21,125 @@ from megatron.utils import get_ltor_masks_and_position_ids from megatron.utils import average_losses_across_data_parallel_group from megatron.arguments import core_transformer_config_from_args +from megatron.utils import ( + report_memory, + throughput_calculator, + checkpoint_throughput_calculator +) +from pathlib import Path import deepspeed from deepspeed.runtime.utils import see_memory_usage from deepspeed.accelerator.real_accelerator import get_accelerator -import os import subprocess +import wandb +import time from torch import nn import torch.nn.functional as F +# from ezpz import get_logger +from ezpz.dist import setup_torch, get_world_size + +RANK = 
setup_torch( + backend='deepspeed', + port='5432', +) +WORLD_SIZE = get_world_size() +LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" + + +# os.environ[''] +# wblogger = logging.getLogger("wandb") +# wblogger.setLevel(logging.DEBUG) + +# log = get_logger(__name__, level=LEVEL) +# +# log.critical(f"Hello from rank: {RANK} / {WORLD_SIZE} !") + +import socket +from typing import Optional +# log.critical(f"Setting up W&B from rank: {RANK} with {wb_project_name}") + + +def setup_wandb(project_name: Optional[str] = None): + print(f"Setting up W&B from: {RANK}") + project_name = ( + os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') + if project_name is None else project_name + ) + print(f"Setting up wandb from rank: {RANK}") + print(f"Using: WB PROJECT: {project_name}") + # if get_rank() == 0: + # tensorboard_dir = args.tensorboard_dir + tensorboard_dir = None + # if config is None: + tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) + # else: + # tensorboard_dir = ( + # config.get( + # 'tensorboard_dir', + # None, # os.getcwd() + # ) + # ) + if tensorboard_dir is not None: + print(f'Patching tensorboard from {tensorboard_dir}') + wandb.tensorboard.patch(root_logdir=tensorboard_dir) + # wbrun_id = wandb.util.generate_id() + current_time = time.time() + # local_time = time.localtime(current_time) + # if wandb.run is None: + wandb.init( + resume='allow', + sync_tensorboard=(tensorboard_dir is not None), # True, + project=(project_name if project_name is not None else None), + # dir=(tensorboard_dir if tensorboard_dir is not None else None), + ) + assert wandb.run is not None + print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({'current_time': current_time}) + model_size = os.environ.get('MODEL_SIZE', None) + wandb.run.config.update({'world_size': get_world_size()}) + # if config is not None: + # wandb.run.config.update(config) + env = { + k: v for k, v in dict(os.environ).items() + if not k.startswith('_ModuleTable') + } + _ = 
env.pop('LS_COLORS', None) + _ = env.pop('PS1', None) + wandb.run.config.update({'env': env}) + hostname = socket.gethostbyaddr(socket.gethostname())[0] + if hostname.startswith('theta'): + wandb.run.config.update({'machine': 'ThetaGPU'}) + elif hostname.startswith('x3'): + wandb.run.config.update({'machine': 'Polaris'}) + elif hostname.startswith('x1'): + wandb.run.config.update({'machine': 'Sunspot'}) + elif hostname.startswith('nid'): + wandb.run.config.update({'machine': 'Perlmutter'}) + elif hostname.startswith('login'): + wandb.run.config.update({'machine': 'NERSC'}) + else: + wandb.run.config.update({'machine': hostname}) + if model_size is not None: + wandb.run.config.update({'MODEL_SIZE': model_size}) + def model_provider(pre_process=True, post_process=True): """Build the model.""" - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - + see_memory_usage("Before Building Model", force=True) args = get_args() config = core_transformer_config_from_args(args) + # args = get_args() + # timers = get_timers() + if wandb.run is not None: + print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({"args": vars(args)}) + if RANK == 0: + git_ds_info() + with deepspeed.zero.Init(sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), remote_device=None if args.remote_device == 'none' else args.remote_device, config_dict_or_path=args.deepspeed_config, @@ -92,7 +195,21 @@ def model_provider(pre_process=True, post_process=True): pre_process=pre_process, post_process=post_process ) - see_memory_usage(f"After Building Model", force=True) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + # print_rank_0('\n ------------------------ ') + # print_rank_0(f'num of parameters {num_params}') + # print_rank_0('------------------------\n ') + print_rank_0(80 * '-') + print_rank_0(f"Number of parameters in model: {num_params}") + print_rank_0(80 * '-') + 
see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + wandb.run.watch( + model, + log='all', + log_graph=True, + ) + wandb.run.config.update({'num_params': num_params}) return model @@ -118,7 +235,8 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Get the masks and postition ids. - skip_mask = args.use_flash_attn or args.use_flash_attn_triton + skip_mask = hasattr(args, 'use_flash_attn') or hasattr(args, 'flash_attn_triton') + # skip_mask = args.use_flash_attn or args.use_flash_attn_triton attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, tokenizer.eod, @@ -357,11 +475,91 @@ def git_ds_info(): print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') +def main(): + if RANK == 0: + setup_wandb() + + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + # # from megatron.training import get_model + # if wandb.run is not None: + # args = get_args() + # timers = get_timers() + # # model = get_model(model_provider, ModelType.encoder_or_decoder) + # elapsed_time = timers('interval-time').elapsed(barrier=True) + # total_iterations = os.environ.get( + # "TOTAL_ITERATIONS", + # (args.train_iters + args.eval_iters) + # ) + # seq_len = args.seq_length + # elapsed_time_per_iteration = elapsed_time / total_iterations + # if model is not None: + # samples_per_sec, tflops, approx_params_in_billions = throughput_calculator( + # model, + # args, + # elapsed_time, + # total_iterations, + # ) + # # Compute throughput. 
+ # samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + # tokens_per_sec = samples_per_sec * seq_len + # tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + # sample_consumption_rate = args.consumed_train_samples / elapsed_time + # token_consumption_rate = args.consumed_train_tokens / elapsed_time + # # Tensorboard values. + # tdata = { + # # 'iteration': iteration, + # 'consumed_train_samples': args.consumed_train_samples, + # 'consumed_train_tokens': args.consumed_train_tokens, + # # 'learning_rate': learning_rate, + # # 'batch_size': batch_size, + # # 'loss_scale': loss_scale, + # # 'grad_norm': grad_norm, + # } + # # for key in loss_dict: + # # tdata[f'lm-loss/{key}'] = loss_dict[key] + # + # tdata = {f'train/{k}': v for k, v in tdata.items()} + # # if wbrun is not None and wbrun is wandb.run: + # if wandb.run is not None: + # wandb.run.log(tdata, commit=False) + # tput = { + # 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s + # 'throughput/samples_per_sec': samples_per_sec, + # 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, + # 'throughput/tokens_per_sec': tokens_per_sec, + # 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, + # 'throughput/tflops': tflops, + # 'throughput/approx_params_in_billions': approx_params_in_billions, + # 'throughput/sample_consumption_rate': sample_consumption_rate, + # 'throughput/token_consumption_rate': token_consumption_rate, + # 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, + # } + # wandb.run.log(tput) + return model + + + if __name__ == "__main__": - git_ds_info() - pretrain(train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process) + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # 
forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit() From 02cef749db7426e7a918b96147c5bd4a1a8e4228 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 10 Oct 2023 18:59:07 -0500 Subject: [PATCH 004/268] Update `ALCF/launch.sh` --- ALCF/launch.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ALCF/launch.sh b/ALCF/launch.sh index c61a0e22ce..dc7a08d991 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -117,6 +117,8 @@ elasticDistributed() { NHOSTS=$(wc -l < "${HOSTFILE}") NGPU_PER_HOST=$(nvidia-smi -L | wc -l) NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export MASTER_ADDR="127.0.0.1" + export MASTER_PORT="5432" EXEC_STR=( "${MPI_COMMAND}" "${MPI_DEFAULTS}" From cb9397b361947847209be0128d3ec02deec9a214 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 10 Oct 2023 18:59:21 -0500 Subject: [PATCH 005/268] Update `ALCF/setup.sh` --- ALCF/setup.sh | 78 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/ALCF/setup.sh b/ALCF/setup.sh index b3b135c513..b8918df147 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -41,23 +41,36 @@ function thetagpuMPI() { NVME_PATH="/raid/scratch/" MPI_COMMAND=$(which mpirun) # export PATH="${CONDA_PREFIX}/bin:${PATH}" - _MPI_DEFAULTS=( - "--hostfile ${HOSTFILE}" - "-x CFLAGS" - "-x LDFLAGS" - "-x http_proxy" - "-x PYTHONUSERBASE" - "-x https_proxy" - "-x PATH" - "-x CUDA_DEVICE_MAX_CONNECTIONS" - "-x LD_LIBRARY_PATH" - ) - _MPI_ELASTIC=( - "-n ${NGPUS}" - "-npernode ${NGPU_PER_HOST}" - ) - export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" - export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + MPI_DEFAULTS="\ + --hostfile ${HOSTFILE} \ + -x CFLAGS \ + -x LDFLAGS \ + -x http_proxy \ + 
-x CUDA_DEVICE_MAX_CONNECTIONS \ + -x PYTHONUSERBASE \ + -x https_proxy \ + -x PATH \ + -x LD_LIBRARY_PATH" + MPI_ELASTIC="\ + -n ${NGPUS} \ + -npernode ${NGPU_PER_HOST}" + # _MPI_DEFAULTS=( + # "--hostfile ${HOSTFILE}" + # "-x CFLAGS" + # "-x LDFLAGS" + # "-x http_proxy" + # "-x PYTHONUSERBASE" + # "-x https_proxy" + # "-x PATH" + # "-x CUDA_DEVICE_MAX_CONNECTIONS" + # "-x LD_LIBRARY_PATH" + # ) + # _MPI_ELASTIC=( + # "-n ${NGPUS}" + # "-npernode ${NGPU_PER_HOST}" + # ) + # export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + # export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" else echo "Skipping thetaGPUMPI() on $(hostname)" fi @@ -71,17 +84,24 @@ function polarisMPI() { export NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) export MPI_COMMAND=$(which mpiexec) export NVME_PATH="/local/scratch/" - _MPI_DEFAULTS=( - "--envall" - "--verbose" - "--hostfile ${HOSTFILE}" - ) - _MPI_ELASTIC=( - "-n ${NGPUS}" - "--ppn ${NGPU_PER_HOST}" - ) - export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" - export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" + MPI_DEFAULTS="\ + --envall \ + --verbose \ + --hostfile ${HOSTFILE}" + MPI_ELASTIC="\ + -n ${NGPUS} \ + --ppn ${NGPU_PER_HOST}" + # _MPI_DEFAULTS=( + # "--envall" + # "--verbose" + # "--hostfile ${HOSTFILE}" + # ) + # _MPI_ELASTIC=( + # "-n ${NGPUS}" + # "--ppn ${NGPU_PER_HOST}" + # ) + # export MPI_DEFAULTS="$(join_by ' ' ${_MPI_DEFAULTS})" + # export MPI_ELASTIC="$(join_by ' ' ${_MPI_ELASTIC})" else echo "Skipping polarisMPI() on $(hostname)" fi @@ -107,7 +127,7 @@ function setupMPI() { function condaPolaris() { if [[ "$(hostname)" == x3* ]]; then - DATE_STR="2023-09-29" + DATE_STR="2023-10-04" [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" else From 36c4c79490af081bdd5a89353f0de31238ee1aab Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 11 Oct 2023 07:28:22 -0500 Subject: [PATCH 006/268] Move to public dataset --- ALCF/args.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 
deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index 200fd21fc9..058e8f5df5 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -146,8 +146,10 @@ echo "--------------------------------" # ┗━━━━━━━━━━━━┛ [ "$(hostname)==login*" ] && MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" [ "$(hostname)==nid*" ] && MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" -[ "$(hostname)==theta*" ] && MEGATRON_DIR="/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" -[ "$(hostname)==x3*" ] && MEGATRON_DIR="/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +[ "$(hostname)==theta*" ] && MEGATRON_DIR="/lus/eagle/projects/datasets/BookCorpusDataset" +# "/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +[ "$(hostname)==x3*" ] && MEGATRON_DIR="/lus/eagle/projects/datasets/BookCorpusDataset" +# /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" # DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document DATA_DIR="${MEGATRON_DIR}/dataset" From 4709afc3616f5e4d1ab454f3e697ff27a3d4e46e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 11 Oct 2023 21:01:33 -0500 Subject: [PATCH 007/268] Update `ALCF/args.sh` --- ALCF/args.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index 058e8f5df5..ad929532d0 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -144,16 +144,20 @@ echo "--------------------------------" # ┏━━━━━━━━━━━━┓ # ┃ Data paths ┃ # ┗━━━━━━━━━━━━┛ -[ "$(hostname)==login*" ] && MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" -[ "$(hostname)==nid*" ] && 
MEGATRON_DIR="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" -[ "$(hostname)==theta*" ] && MEGATRON_DIR="/lus/eagle/projects/datasets/BookCorpusDataset" +[ "$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +[ "$(hostname)==nid*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +# [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" +[ "$(hostname)==theta*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" +[ "$(hostname)==x3*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" # "/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" -[ "$(hostname)==x3*" ] && MEGATRON_DIR="/lus/eagle/projects/datasets/BookCorpusDataset" +# /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document" +# [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" # /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +# /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document.bin # DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document -DATA_DIR="${MEGATRON_DIR}/dataset" -DATA_PATH="${DATA_DIR}/BookCorpusDataset_text_document" +DATA_DIR="${DATA_PARENT}/dataset" +DATA_PATH="${DATA_DIR}/genslm_subsample_200k_sequence_document" VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" From 3e603c680579b5cf09b5fef4b858598b93191cf3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 14:23:44 -0500 Subject: [PATCH 008/268] Update `ALCF/args.sh` --- ALCF/args.sh | 21 ++++++++++++++++++--- 1 file changed, 18 
insertions(+), 3 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index ad929532d0..3740cc067a 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -60,8 +60,11 @@ export MICRO_BATCH=${MICRO_BATCH:-1} export GRADIENT_ACCUMULATION_STEPS=${GAS:-1} export MODEL_TYPE=${MODEL_TYPE:-"gpt"} # set bert or gpt export SP_TYPE=${SP_TYPE:-"megatron"} # set ds or megatron +export ZERO_STAGE=${ZERO_STAGE:-1} +export MPSIZE=${MPSIZE:-${WORLD_SIZE:-1}} +export SPSIZE=${SPSIZE:-1} - +# # Deal with Sequence Parallel implementation --------------------------------------- # ---------------------------------------------------------------------------------- if [[ ${SP_TYPE} == "ds" ]]; then @@ -100,6 +103,12 @@ elif [[ ${SP_TYPE} == "megatron" ]]; then [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + if [[ ${PPSIZE} > 1 ]]; then # && ${MPSIZE}==${WORLD_SIZE} ]]; + MPSIZE=$(( WORLD_SIZE / PPSIZE )) + echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))" + echo "MPSIZE: $MPSIZE" + # MPSIZE="${WORLD_SIZE}/" + fi export SPSIZE="${SPSIZE}" export MPSIZE="${MPSIZE}" export ZERO_STAGE="${ZERO_STAGE}" @@ -109,7 +118,7 @@ else # exit 1 fi # ------------------------------------------------------------------------ - +# echo "####################################################" echo "USING: ${SP_TYPE}" echo "SPSIZE: ${SPSIZE}" @@ -137,8 +146,14 @@ fi # [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" export GLOBAL_BATCH="$GLOBAL_BATCH" +DPSIZE=$(( $WORLD_SIZE / $PPSIZE / $MPSIZE )) + +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + echo "--------------------------------" echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "USING DPSIZE: ${DPSIZE}" 
echo "--------------------------------" # ┏━━━━━━━━━━━━┓ @@ -194,7 +209,7 @@ fi if [[ $DDP_IMPL == 'FSDP' ]]; then RUN_STR="FSDP_${RUN_STR}" fi -if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]] ;then +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]]; then RUN_STR="actCkpt_${RUN_STR}" fi if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then From a8454935fabdbe1e88dcba54204e61a509ec7c56 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 14:26:49 -0500 Subject: [PATCH 009/268] Update `ALCF/launch.sh` --- ALCF/launch.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ALCF/launch.sh b/ALCF/launch.sh index dc7a08d991..affab7073e 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -103,7 +103,7 @@ fullNode() { # ┃ Use all available GPUs on all available nodes ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ elasticDistributed() { - if [[ "$(hostname)==theta*" || "$(hostname)==x3*" ]]; then + if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then if [[ $(hostname) == theta* ]]; then echo "Setting up ThetaGPU from $(hostname)" HOSTFILE="${COBALT_NODEFILE}" @@ -128,10 +128,10 @@ elasticDistributed() { "${gpt_args}" "${ds_args}" ) - elif [[ "$(hostname)==nid*" || "$(hostname)==login*" ]]; then + elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then echo "Setting up from Perlmutter on $(hostname)" - [ "$(hostname)==nid*" ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 - [ "$(hostname)==nid*" ] && export MACHINE="perlmutter" || export MACHINE="NERSC" + [ $(hostname) == nid* ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 + [ $(hostname) == nid* ] && export MACHINE="perlmutter" || export MACHINE="NERSC" NGPU_PER_HOST=$(nvidia-smi -L | wc -l) # NGPU_PER_HOST="$SLURM_GPUS_ON_NODE" NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" From f5c8f991443dc9b9cb55fb3dc6c7edf26c517c38 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 14:27:14 -0500 Subject: [PATCH 010/268] Update `ALCF/setup.sh` --- ALCF/setup.sh | 27 +++++++++++++++------------ 1 file changed, 15 
insertions(+), 12 deletions(-) diff --git a/ALCF/setup.sh b/ALCF/setup.sh index b8918df147..94416982b8 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -126,7 +126,7 @@ function setupMPI() { } function condaPolaris() { - if [[ "$(hostname)" == x3* ]]; then + if [[ $(hostname) == x3* ]]; then DATE_STR="2023-10-04" [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" @@ -136,10 +136,12 @@ function condaPolaris() { } function condaThetaGPU() { - if [[ "$(hostname)" == theta* ]]; then + if [[ $(hostname) == theta* ]]; then DATE_STR="2023-01-11" - [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" - [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" + # [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" + # [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" else echo "Skipping condaThetaGPU() on $(hostname)" fi @@ -151,8 +153,8 @@ function setupThetaGPU() { if [[ $(hostname) == theta* ]]; then setupMPI DATE_STR="2023-01-11" - [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}" || echo "Caught CONDA_EXE: ${CONDA_EXE}" - [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" || echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" else echo "Skipping setupThetaGPU() on $(hostname)" fi @@ -161,23 +163,24 @@ function setupThetaGPU() { function setupPolaris() { export LAB="ALCF" export MACHINE="Polaris" - if [[ "$(hostname)" == x3* ]]; then + if [[ $(hostname) == x3* ]]; then # SETUP MPI -------------------------------- setupMPI # SETUP Python -------------------------------- DATE_STR="2023-09-29" - [ "${CONDA_EXE}" ] || loadCondaEnv "${DATE_STR}-unstable" || echo "Caught CONDA_EXE: 
${CONDA_EXE}" - [ "${VIRTUAL_ENV}" ] || setupVenv "${DATE_STR}" || echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" + [ "${CONDA_EXE}" ] && echo "Caught CONDA_EXE: ${CONDA_EXE}" || loadCondaEnv "${DATE_STR}-unstable" + [ "${VIRTUAL_ENV}" ] && echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" || setupVenv "${DATE_STR}" else echo "Skipping setupPolaris() on $(hostname)" fi } function setupALCF() { + # if [[ $(hostname -s) == theta* || $(hostname -s) == x3* ]]; then echo "True" ; else echo "False" ; fi if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then setupMPI - [ "$(hostname)==theta*" ] && setupThetaGPU || echo "Skipping setupThetaGPU from $(hostname)" - [ "$(hostname)==x3*" ] && setupPolaris || echo "Skipping setupPolaris from $(hostname)" + [ $(hostname) == theta* ] && setupThetaGPU || echo "Skipping setupThetaGPU from $(hostname)" + [ $(hostname) == x3* ] && setupPolaris || echo "Skipping setupPolaris from $(hostname)" else echo "Skipping setupALCF() on $(hostname)" fi @@ -223,7 +226,7 @@ function setupMachine() { setupALCF [ "${HOSTNAME}==theta*" ] && condaThetaGPU [ "${HOSTNAME}==x3*" ] && condaPolaris - elif [[ "${HOSTNAME}== nid*" || "${HOSTNAME}== login*" ]]; then + elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then export LAB="NERSC" setupPerlmutter [ "${HOSTNAME}==login*" ] && setupPerlmutter From 64fd153e0b26e0f78417bc1f617fa872f9c3c208 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 14:05:56 -0700 Subject: [PATCH 011/268] Update `pretrain_gpt.py` --- pretrain_gpt.py | 143 ++++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 65 deletions(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 272f05318b..63b46ed0d9 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -39,7 +39,7 @@ import torch.nn.functional as F # from ezpz import get_logger -from ezpz.dist import setup_torch, get_world_size +from ezpz.dist import setup_torch, get_world_size, setup_wandb RANK = setup_torch( backend='deepspeed', @@ 
-48,6 +48,19 @@ WORLD_SIZE = get_world_size() LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" +if RANK == 0: + print(f"Setting up W&B from: {RANK}") + project_name = ( + os.environ.get( + 'WB_PROJECT', + os.environ.get( + 'WANDB_PROJECT', + 'GenSLM-Megatron-DS' + ), + ) + ) + setup_wandb(project_name=project_name) + # os.environ[''] # wblogger = logging.getLogger("wandb") @@ -62,68 +75,68 @@ # log.critical(f"Setting up W&B from rank: {RANK} with {wb_project_name}") -def setup_wandb(project_name: Optional[str] = None): - print(f"Setting up W&B from: {RANK}") - project_name = ( - os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') - if project_name is None else project_name - ) - print(f"Setting up wandb from rank: {RANK}") - print(f"Using: WB PROJECT: {project_name}") - # if get_rank() == 0: - # tensorboard_dir = args.tensorboard_dir - tensorboard_dir = None - # if config is None: - tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) - # else: - # tensorboard_dir = ( - # config.get( - # 'tensorboard_dir', - # None, # os.getcwd() - # ) - # ) - if tensorboard_dir is not None: - print(f'Patching tensorboard from {tensorboard_dir}') - wandb.tensorboard.patch(root_logdir=tensorboard_dir) - # wbrun_id = wandb.util.generate_id() - current_time = time.time() - # local_time = time.localtime(current_time) - # if wandb.run is None: - wandb.init( - resume='allow', - sync_tensorboard=(tensorboard_dir is not None), # True, - project=(project_name if project_name is not None else None), - # dir=(tensorboard_dir if tensorboard_dir is not None else None), - ) - assert wandb.run is not None - print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") - wandb.run.config.update({'current_time': current_time}) - model_size = os.environ.get('MODEL_SIZE', None) - wandb.run.config.update({'world_size': get_world_size()}) - # if config is not None: - # wandb.run.config.update(config) - env = { - k: v for k, v in dict(os.environ).items() - if not k.startswith('_ModuleTable') - } - _ = 
env.pop('LS_COLORS', None) - _ = env.pop('PS1', None) - wandb.run.config.update({'env': env}) - hostname = socket.gethostbyaddr(socket.gethostname())[0] - if hostname.startswith('theta'): - wandb.run.config.update({'machine': 'ThetaGPU'}) - elif hostname.startswith('x3'): - wandb.run.config.update({'machine': 'Polaris'}) - elif hostname.startswith('x1'): - wandb.run.config.update({'machine': 'Sunspot'}) - elif hostname.startswith('nid'): - wandb.run.config.update({'machine': 'Perlmutter'}) - elif hostname.startswith('login'): - wandb.run.config.update({'machine': 'NERSC'}) - else: - wandb.run.config.update({'machine': hostname}) - if model_size is not None: - wandb.run.config.update({'MODEL_SIZE': model_size}) +# def setup_wandb(project_name: Optional[str] = None): +# print(f"Setting up W&B from: {RANK}") +# project_name = ( +# os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') +# if project_name is None else project_name +# ) +# print(f"Setting up wandb from rank: {RANK}") +# print(f"Using: WB PROJECT: {project_name}") +# # if get_rank() == 0: +# # tensorboard_dir = args.tensorboard_dir +# tensorboard_dir = None +# # if config is None: +# tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) +# # else: +# # tensorboard_dir = ( +# # config.get( +# # 'tensorboard_dir', +# # None, # os.getcwd() +# # ) +# # ) +# if tensorboard_dir is not None: +# print(f'Patching tensorboard from {tensorboard_dir}') +# wandb.tensorboard.patch(root_logdir=tensorboard_dir) +# # wbrun_id = wandb.util.generate_id() +# current_time = time.time() +# # local_time = time.localtime(current_time) +# # if wandb.run is None: +# wandb.init( +# resume='allow', +# sync_tensorboard=(tensorboard_dir is not None), # True, +# project=(project_name if project_name is not None else None), +# # dir=(tensorboard_dir if tensorboard_dir is not None else None), +# ) +# assert wandb.run is not None +# print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") +# wandb.run.config.update({'current_time': 
current_time}) +# model_size = os.environ.get('MODEL_SIZE', None) +# wandb.run.config.update({'world_size': get_world_size()}) +# # if config is not None: +# # wandb.run.config.update(config) +# env = { +# k: v for k, v in dict(os.environ).items() +# if not k.startswith('_ModuleTable') +# } +# _ = env.pop('LS_COLORS', None) +# _ = env.pop('PS1', None) +# wandb.run.config.update({'env': env}) +# hostname = socket.gethostbyaddr(socket.gethostname())[0] +# if hostname.startswith('theta'): +# wandb.run.config.update({'machine': 'ThetaGPU'}) +# elif hostname.startswith('x3'): +# wandb.run.config.update({'machine': 'Polaris'}) +# elif hostname.startswith('x1'): +# wandb.run.config.update({'machine': 'Sunspot'}) +# elif hostname.startswith('nid'): +# wandb.run.config.update({'machine': 'Perlmutter'}) +# elif hostname.startswith('login'): +# wandb.run.config.update({'machine': 'NERSC'}) +# else: +# wandb.run.config.update({'machine': hostname}) +# if model_size is not None: +# wandb.run.config.update({'MODEL_SIZE': model_size}) def model_provider(pre_process=True, post_process=True): @@ -476,8 +489,8 @@ def git_ds_info(): def main(): - if RANK == 0: - setup_wandb() + # if RANK == 0: + # setup_wandb() model = pretrain( train_valid_test_datasets_provider, From a2fabf2e8d3b628ec0f7761b90293dcc83097d96 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 15:01:50 -0700 Subject: [PATCH 012/268] Clean up data paths in `ALCF/args.sh` --- ALCF/args.sh | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index 3740cc067a..fad650b1dd 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -159,22 +159,39 @@ echo "--------------------------------" # ┏━━━━━━━━━━━━┓ # ┃ Data paths ┃ # ┗━━━━━━━━━━━━┛ -[ "$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" -[ "$(hostname)==nid*" ] && 
DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" -# [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" -[ "$(hostname)==theta*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" -[ "$(hostname)==x3*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" -# "/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" -# /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document" -# [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" -# /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" -# /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document.bin - -# DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document +if [[ $(hostname) == nid* || $(hostname) == login* ]]; then + DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" + DATA_TYPE="BookCorpusDataset_text_document" +elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" + DATA_TYPE="genslm_subsample_200k_sequence_document" +else + echo "Unable to determine DATA_PARENT for $(hostname)." + echo "Exiting!" 
+ exit 1 +fi + DATA_DIR="${DATA_PARENT}/dataset" -DATA_PATH="${DATA_DIR}/genslm_subsample_200k_sequence_document" +DATA_PATH="${DATA_DIR}/${DATA_TYPE}" VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" +# # +# [ "$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +# [ "$(hostname)==nid*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +# # [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" +# [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" +# [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" +# # "/lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +# # /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document" +# # [ "$(hostname)==x3*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" +# # /lus/grand/projects/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DS-Benchmarking" +# # /lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k/dataset/genslm_subsample_200k_sequence_document.bin +# +# # DATA_PATH=/lus/grand/projects/datascience/vsastry/genslm_subsample_200k_sequence_document/genslm_subsample_200k_sequence_document +# DATA_DIR="${DATA_PARENT}/dataset" +# DATA_PATH="${DATA_DIR}/genslm_subsample_200k_sequence_document" +# VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +# MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" # ┏━━━━━━━━━━━━━━━━━━━┓ # ┃ FILE I/O SETTINGS ┃ From 416fb9cb0da3d6f09f4184dbcf0f70461de4f15f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 15:02:19 -0700 Subject: [PATCH 013/268] Updating launch vars in `ALCF/launch.sh` on Perlmutter --- ALCF/launch.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) 
diff --git a/ALCF/launch.sh b/ALCF/launch.sh index affab7073e..ebd1b43f1e 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -130,8 +130,10 @@ elasticDistributed() { ) elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then echo "Setting up from Perlmutter on $(hostname)" - [ $(hostname) == nid* ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 - [ $(hostname) == nid* ] && export MACHINE="perlmutter" || export MACHINE="NERSC" + NHOSTS=${SLURM_NNODES-1} + MACHINE="Perlmutter" + # [ $(hostname) == nid* ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 + # [ $(hostname) == nid* ] && export MACHINE="perlmutter" || export MACHINE="NERSC" NGPU_PER_HOST=$(nvidia-smi -L | wc -l) # NGPU_PER_HOST="$SLURM_GPUS_ON_NODE" NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" From 3cd7eccc9df9817874824b3fbd232385c1df3d08 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 18:50:33 -0700 Subject: [PATCH 014/268] Create `Perlmutter` branch Changes: - `ALCF/args.sh` - `ALCF/launch.sh` - `ALCF/setup.sh` to simplify setup and launching with `srun` on Perlmutter @ NERSC --- ALCF/args.sh | 1 + ALCF/launch.sh | 43 +++++++++++++++++++++++++++++-------- ALCF/setup.sh | 57 +++++++++++++++++++++++++++++++------------------- 3 files changed, 71 insertions(+), 30 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index fad650b1dd..62b4c6494f 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -503,6 +503,7 @@ elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then fi if [[ "$USE_SEQUENCE_PARALLEL" == 1 ]]; then + export CUDA_DEVICE_MAX_CONNECTIONS=1 gpt_args+=( "--sequence-parallel" ) diff --git a/ALCF/launch.sh b/ALCF/launch.sh index ebd1b43f1e..5790727f7d 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -99,6 +99,21 @@ fullNode() { launchJob "$@" 2>&1 | tee "${OUTPUT_LOG}" } + +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" + export MACHINE="Perlmutter" + export NHOSTS="${SLURM_NNODES:-1}" + export 
NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + + # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ Use all available GPUs on all available nodes ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ @@ -130,26 +145,36 @@ elasticDistributed() { ) elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then echo "Setting up from Perlmutter on $(hostname)" - NHOSTS=${SLURM_NNODES-1} + # NHOSTS=${SLURM_NNODES-1} MACHINE="Perlmutter" + setupPerlmutter + setupSrun + echo "SRUN_EXEC: ${SRUN_EXEC}" # [ $(hostname) == nid* ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 # [ $(hostname) == nid* ] && export MACHINE="perlmutter" || export MACHINE="NERSC" - NGPU_PER_HOST=$(nvidia-smi -L | wc -l) + # NGPU_PER_HOST=$(nvidia-smi -L | wc -l) # NGPU_PER_HOST="$SLURM_GPUS_ON_NODE" - NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + # NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" # export MACHINE="perlmutter" - export MASTER_ADDR="127.0.0.1" - export MASTER_PORT="5432" + export MASTER_ADDR="$SLURMD_NODENAME" + # export MASTER_PORT="5432" EXEC_STR=( - "srun" - "-N ${NHOSTS}" - "-n ${NGPUS}" - "-l -u" + "${SRUN_EXEC}" "$(which python3)" "${MAIN}" "${gpt_args}" "${ds_args}" ) + # EXEC_STR=( + # "srun" + # "-N ${NHOSTS}" + # "-n ${NGPUS}" + # "-l -u" + # "$(which python3)" + # "${MAIN}" + # "${gpt_args}" + # "${ds_args}" + # ) else echo "Unexpected hostname $(hostname)" fi diff --git a/ALCF/setup.sh b/ALCF/setup.sh index 94416982b8..47b2d17667 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -176,7 +176,6 @@ function setupPolaris() { } function setupALCF() { - # if [[ $(hostname -s) == theta* || $(hostname -s) == x3* ]]; then echo "True" ; else echo "False" ; fi if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then setupMPI [ $(hostname) == theta* ] && setupThetaGPU || echo "Skipping setupThetaGPU from $(hostname)" @@ 
-186,30 +185,45 @@ function setupALCF() { fi } + +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun -N ${NHOSTS} -n ${NGPUS} -l -u" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + # ┏━━━━━━━┓ # ┃ NERSC ┃ # ┗━━━━━━━┛ function setupPerlmutter() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then module load libfabric cudatoolkit pytorch/2.0.1 - if [[ $(hostname) == login* ]]; then - export MACHINE="NERSC" - module load pytorch/2.0.1 - export NHOSTS=1 - export NGPU_PER_HOST=1 - export NGPUS=1 - # echo "$(hostname)" > "${HERE}/hostfile" - elif [[ $(hostname) == nid* ]]; then - export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" - export NODE_RANK=0 - export CUDA_DEVICE_MAX_CONNECTIONS=1 - export MACHINE="PERLMUTTER" - export NHOSTS="${SLURM_NNODES:-1}" - export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" - export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" - else - echo "Unexpected $(hostname) on NERSC" - fi + [ $SLURM_JOB_ID ] \ + && echo "Caught SLURM_JOB_ID: ${SLURM_JOB_ID}" \ + || echo "!!!!!! Running without SLURM allocation !!!!!!!!" 
+ # if [[ $(hostname) == login* ]]; then + # export MACHINE="NERSC" + # module load pytorch/2.0.1 + # export NHOSTS=1 + # export NGPU_PER_HOST=1 + # export NGPUS=1 + # # echo "$(hostname)" > "${HERE}/hostfile" + # elif [[ $(hostname) == nid* ]]; then + # export NODE_RANK=0 + export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" + # export CUDA_DEVICE_MAX_CONNECTIONS=1 + export MACHINE="Perlmutter" + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + # else + # echo "Unexpected $(hostname) on NERSC" + # fi echo "+++++++++++++++++++++++++++++++++++" echo "Using python: $(which python3)" echo "+++++++++++++++++++++++++++++++++++" @@ -228,9 +242,10 @@ function setupMachine() { [ "${HOSTNAME}==x3*" ] && condaPolaris elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then export LAB="NERSC" + setupSrun setupPerlmutter - [ "${HOSTNAME}==login*" ] && setupPerlmutter - [ "${HOSTNAME}==nid*" ] && setupPerlmutter + # [ "${HOSTNAME}==login*" ] && setupPerlmutter + # [ "${HOSTNAME}==nid*" ] && setupPerlmutter else echo "Unexpected hostname: $(hostname)" fi From 3108af8c2b2d6d7e741e9c08950d2bfff1a51b32 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 22:13:24 -0500 Subject: [PATCH 015/268] Respect `WANDB_MODE=disabled` --- ALCF/args.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index 62b4c6494f..b5f018de98 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -279,6 +279,10 @@ CPU_OPTIM=" --cpu-optimizer" # OFFLOAD_DEVICE="none" # CPU_OPTIM=" " + +[ "${WANDB_MODE}" == "disabled" ] && WANDB_ENABLE="false" || WANDB_ENABLE="true" +echo "WANDB_ENABLE: ${WANDB_ENABLE}" + # ┏━━━━━━━━━━━━━━━━━━┓ # ┃ DeepSpeed Config ┃ # ┗━━━━━━━━━━━━━━━━━━┛ @@ -346,7 +350,7 @@ if [[ $ZERO_STAGE == "3" ]] ; then "debug": false }, "wandb": { -"enabled": true, +"enabled": $WANDB_ENABLE, "project": "GenSLM-Megatron-DS" } } @@ 
-403,7 +407,7 @@ else "debug": false }, "wandb": { -"enabled": true, +"enabled": $WANDB_ENABLE, "project": "GenSLM-Megatron-DS" } } From ba38d103bf342d78a75f061d189d7885f859d09d Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 22:13:48 -0500 Subject: [PATCH 016/268] Fix `hostname` check in `ALCF/setup.sh` --- ALCF/setup.sh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ALCF/setup.sh b/ALCF/setup.sh index 47b2d17667..d0fe4dadf9 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -175,17 +175,26 @@ function setupPolaris() { fi } + function setupALCF() { if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then setupMPI - [ $(hostname) == theta* ] && setupThetaGPU || echo "Skipping setupThetaGPU from $(hostname)" - [ $(hostname) == x3* ] && setupPolaris || echo "Skipping setupPolaris from $(hostname)" + if [[ $(hostname) == theta* ]]; then + echo "Setting up ThetaGPU from $(hostname)" + setupThetaGPU + elif [[ $(hostname) == x3* ]]; then + echo "Setting up Polaris from $(hostname)" + setupPolaris + else + echo "Unknown hostname $(hostname) in setupALCF()" + fi else echo "Skipping setupALCF() on $(hostname)" fi } + function setupSrun() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then export NHOSTS="${SLURM_NNODES:-1}" @@ -238,8 +247,8 @@ function setupMachine() { if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then export LAB="ALCF" setupALCF - [ "${HOSTNAME}==theta*" ] && condaThetaGPU - [ "${HOSTNAME}==x3*" ] && condaPolaris + # [ "${HOSTNAME}==theta*" ] && condaThetaGPU + # [ "${HOSTNAME}==x3*" ] && condaPolaris elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then export LAB="NERSC" setupSrun @@ -255,7 +264,7 @@ function setupMachine() { # ┃ SETUP CONDA + MPI ENVIRONMENT @ ALCF ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ function setup() { - export NCCL_DEBUG=warn + export NCCL_DEBUG=info # TORCH_EXTENSIONS_DIR="${HERE}/.cache/torch_extensions" export WANDB_CACHE_DIR="./cache/wandb" 
setupMachine From 3fb5c8c99355b8e64f490ff4ba26dbc8c1fbd067 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 15 Oct 2023 22:14:18 -0500 Subject: [PATCH 017/268] Respect `WANDB_MODE=disabled` in `pretrain_gpt.py` --- pretrain_gpt.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 63b46ed0d9..b96cb4e25e 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -10,6 +10,7 @@ from functools import partial from megatron import get_args from megatron import print_rank_0 +from rich import print from megatron import get_timers from megatron import get_tokenizer from megatron.core import mpu, tensor_parallel @@ -48,8 +49,10 @@ WORLD_SIZE = get_world_size() LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" -if RANK == 0: - print(f"Setting up W&B from: {RANK}") +WANDB_MODE = os.environ.get('WANDB_MODE', None) +DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' + +if RANK == 0 and not DISABLE_WANDB: project_name = ( os.environ.get( 'WB_PROJECT', @@ -59,6 +62,9 @@ ), ) ) + print('--------------------------------------------------') + print(f"Setting up W&B from: {RANK} with {project_name}") + print('--------------------------------------------------') setup_wandb(project_name=project_name) From c303a749c1495ee84729a17997421d592e55b796 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 17 Oct 2023 19:29:24 -0500 Subject: [PATCH 018/268] (re-)set `NCCL_DEBUG=warn` --- ALCF/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ALCF/setup.sh b/ALCF/setup.sh index d0fe4dadf9..542e4d2358 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -264,7 +264,7 @@ function setupMachine() { # ┃ SETUP CONDA + MPI ENVIRONMENT @ ALCF ┃ # ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ function setup() { - export NCCL_DEBUG=info + export NCCL_DEBUG=warn # TORCH_EXTENSIONS_DIR="${HERE}/.cache/torch_extensions" export WANDB_CACHE_DIR="./cache/wandb" setupMachine From 
10036071c905d6677052442693509ad2984a153f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 17 Oct 2023 19:30:07 -0500 Subject: [PATCH 019/268] Change `--num-workers` from `1` to `0` --- ALCF/args.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index b5f018de98..08d3d8fcd2 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -452,7 +452,7 @@ gpt_args=( "--max-position-embeddings ${SEQ_LEN}" "--train-iters 10" "--lr-decay-iters 320000" - "--num-workers 1" + "--num-workers 0" "$DATA_LOAD_ARGS" "--data-impl mmap" "--split 949,50,1" From 01c57126d02287b35a0ec756bcd882bfdc4bc9d8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 22 Oct 2023 14:03:48 -0700 Subject: [PATCH 020/268] Update slurm commands in `ALCF/launch.sh` --- ALCF/launch.sh | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/ALCF/launch.sh b/ALCF/launch.sh index 5790727f7d..f7ba636836 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -100,7 +100,7 @@ fullNode() { } -function setupSrun() { +function setupSrunOld() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then export NODELIST="${SLURM_JOB_NODELIST:-$(hostname)}" export MACHINE="Perlmutter" @@ -113,6 +113,17 @@ function setupSrun() { fi } +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + # ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ # ┃ Use all available GPUs on all available nodes ┃ @@ -150,14 +161,7 @@ elasticDistributed() { setupPerlmutter setupSrun echo "SRUN_EXEC: ${SRUN_EXEC}" - # [ $(hostname) == nid* ] && NHOSTS="$SLURM_NNODES" || NHOSTS=1 - # [ $(hostname) == nid* ] 
&& export MACHINE="perlmutter" || export MACHINE="NERSC" - # NGPU_PER_HOST=$(nvidia-smi -L | wc -l) - # NGPU_PER_HOST="$SLURM_GPUS_ON_NODE" - # NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" - # export MACHINE="perlmutter" export MASTER_ADDR="$SLURMD_NODENAME" - # export MASTER_PORT="5432" EXEC_STR=( "${SRUN_EXEC}" "$(which python3)" @@ -165,16 +169,6 @@ elasticDistributed() { "${gpt_args}" "${ds_args}" ) - # EXEC_STR=( - # "srun" - # "-N ${NHOSTS}" - # "-n ${NGPUS}" - # "-l -u" - # "$(which python3)" - # "${MAIN}" - # "${gpt_args}" - # "${ds_args}" - # ) else echo "Unexpected hostname $(hostname)" fi From beef50c8b0dc03b8c7f495726789ad04967176d1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 22 Oct 2023 16:04:37 -0500 Subject: [PATCH 021/268] Update `ALCF/launch.sh` to respect `HOSTFILE` --- ALCF/launch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ALCF/launch.sh b/ALCF/launch.sh index f7ba636836..4c1a6ba0e0 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -132,10 +132,10 @@ elasticDistributed() { if [[ $(hostname) == theta* || $(hostname) == x3* ]]; then if [[ $(hostname) == theta* ]]; then echo "Setting up ThetaGPU from $(hostname)" - HOSTFILE="${COBALT_NODEFILE}" + HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}" elif [[ $(hostname) == x3* ]]; then echo "Setting up Polaris from $(hostname)" - HOSTFILE="${PBS_NODEFILE}" + HOSTFILE="${HOSFILE:-${PBS_NODEFILE}}" else echo "Unknown hostname $(hostname)" exit 1 From b6c536c4cfd364395fab4b3cd8ea150d6a325519 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sun, 22 Oct 2023 16:04:53 -0500 Subject: [PATCH 022/268] Update `ALCF/setup.sh` to respect `HOSTFILE` --- ALCF/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ALCF/setup.sh b/ALCF/setup.sh index 542e4d2358..5259ea03a0 100755 --- a/ALCF/setup.sh +++ b/ALCF/setup.sh @@ -34,8 +34,8 @@ function loadCondaEnv() { function thetagpuMPI() { if [[ $(hostname) == theta* ]]; then - export HOSTFILE="${COBALT_NODEFILE}" - 
NHOSTS=$(wc -l < "${COBALT_NODEFILE}") + export HOSTFILE="${HOSTFILE:-${COBALT_NODEFILE}}" + NHOSTS=$(wc -l < "${HOSTFILE}") NGPU_PER_HOST=$(nvidia-smi -L | wc -l) NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) NVME_PATH="/raid/scratch/" @@ -78,8 +78,8 @@ function thetagpuMPI() { function polarisMPI() { if [[ $(hostname) == x3* ]]; then - export HOSTFILE="${PBS_NODEFILE}" - export NHOSTS=$(wc -l < "${PBS_NODEFILE}") + export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" + export NHOSTS=$(wc -l < "${HOSTFILE}") export NGPU_PER_HOST=$(nvidia-smi -L | wc -l) export NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) export MPI_COMMAND=$(which mpiexec) From 39a484d2ea04c2b3714adc15ef2fe363f841e77f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 23 Oct 2023 09:55:00 -0500 Subject: [PATCH 023/268] Add `GPT1T_*L` configs to `ALCF/model.sh` --- ALCF/model.sh | 76 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/ALCF/model.sh b/ALCF/model.sh index c41a908203..d69ea8033a 100755 --- a/ALCF/model.sh +++ b/ALCF/model.sh @@ -258,27 +258,85 @@ A_NLAYERS[$MODEL_145B_KEY]=80 A_HIDDEN[$MODEL_145B_KEY]=12288 A_ATEN_HEADS[$MODEL_145B_KEY]=96 + + +MODEL_1T_HIDDEN=25600 +MODEL_1T_ATEN_HEADS=160 MODEL_1T_1L_KEY="GPT1T_1L" A_NLAYERS[$MODEL_1T_1L_KEY]=1 -A_HIDDEN[$MODEL_1T_1L_KEY]=25600 -A_ATEN_HEADS[$MODEL_1T_1L_KEY]=160 +A_HIDDEN[$MODEL_1T_1L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_1L_KEY]="${MODEL_1T_ATEN_HEADS}" MODEL_1T_2L_KEY="GPT1T_2L" A_NLAYERS[$MODEL_1T_2L_KEY]=2 -A_HIDDEN[$MODEL_1T_2L_KEY]=25600 -A_ATEN_HEADS[$MODEL_1T_2L_KEY]=160 +A_HIDDEN[$MODEL_1T_2L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_2L_KEY]="${MODEL_1T_ATEN_HEADS}" MODEL_1T_4L_KEY="GPT1T_4L" A_NLAYERS[$MODEL_1T_4L_KEY]=4 -A_HIDDEN[$MODEL_1T_4L_KEY]=25600 -A_ATEN_HEADS[$MODEL_1T_4L_KEY]=160 +A_HIDDEN[$MODEL_1T_4L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_4L_KEY]="${MODEL_1T_ATEN_HEADS}" MODEL_1T_8L_KEY="GPT1T_8L" A_NLAYERS[$MODEL_1T_8L_KEY]=8 
-A_HIDDEN[$MODEL_1T_8L_KEY]=25600 -A_ATEN_HEADS[$MODEL_1T_8L_KEY]=160 - +A_HIDDEN[$MODEL_1T_8L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_8L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_16L_KEY="GPT1T_16L" +A_NLAYERS[$MODEL_1T_16L_KEY]=16 +A_HIDDEN[$MODEL_1T_16L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_16L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_24L_KEY="GPT1T_24L" +A_NLAYERS[$MODEL_1T_24L_KEY]=24 +A_HIDDEN[$MODEL_1T_24L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_24L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_30L_KEY="GPT1T_30L" +A_NLAYERS[$MODEL_1T_30L_KEY]=30 +A_HIDDEN[$MODEL_1T_30L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_30L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_32L_KEY="GPT1T_32L" +A_NLAYERS[$MODEL_1T_32L_KEY]=32 +A_HIDDEN[$MODEL_1T_32L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_32L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_60L_KEY="GPT1T_60L" +A_NLAYERS[$MODEL_1T_60L_KEY]=60 +A_HIDDEN[$MODEL_1T_60L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_60L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_64L_KEY="GPT1T_64L" +A_NLAYERS[$MODEL_1T_64L_KEY]=64 +A_HIDDEN[$MODEL_1T_64L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_64L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_120L_KEY="GPT1T_120L" +A_NLAYERS[$MODEL_1T_120L_KEY]=120 +A_HIDDEN[$MODEL_1T_120L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_120L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_128L_KEY="GPT1T_128L" +A_NLAYERS[$MODEL_1T_128L_KEY]=128 +A_HIDDEN[$MODEL_1T_128L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_128L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_256L_KEY="GPT1T_256L" +A_NLAYERS[$MODEL_1T_256L_KEY]=256 +A_HIDDEN[$MODEL_1T_256L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_256L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_512L_KEY="GPT1T_512L" +A_NLAYERS[$MODEL_1T_512L_KEY]=512 +A_HIDDEN[$MODEL_1T_512L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_512L_KEY]="${MODEL_1T_ATEN_HEADS}" + +MODEL_1T_1024L_KEY="GPT1T_1024L" 
+A_NLAYERS[$MODEL_1T_1024L_KEY]=1024 +A_HIDDEN[$MODEL_1T_1024L_KEY]="${MODEL_1T_HIDDEN}" +A_ATEN_HEADS[$MODEL_1T_1024L_KEY]="${MODEL_1T_ATEN_HEADS}" export MODEL_SIZE="${MODEL_SIZE_KEY}" export NLAYERS="${A_NLAYERS[$MODEL_SIZE_KEY]}" From 168d28f70ed45ab64efb3590548ef9b8476b5923 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 24 Oct 2023 11:36:54 -0700 Subject: [PATCH 024/268] Adds `train_sbatch_pp64.sh` --- train_sbatch_pp64.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100755 train_sbatch_pp64.sh diff --git a/train_sbatch_pp64.sh b/train_sbatch_pp64.sh new file mode 100755 index 0000000000..b7baf2539e --- /dev/null +++ b/train_sbatch_pp64.sh @@ -0,0 +1,34 @@ +#!/bin/bash --login +#SBATCH -A m3957_g +#SBATCH -C 'gpu&hbm80g' +#SBATCH -q regular +#SBATCH -t 00:30:00 +#SBATCH --nodes 128 +#SBATCH --gpus 512 + + +# TODO:: +# - Add logic for catching / killing hung process at end of run to ensure +# second run starts up (otherwise, it will wait for the hung process, which +# will run until the job is killed) +# - This wll let us try running multiple experiments in a single slurm job +# allocation. 
+# - Existing (similar implementation) from my `~/bin/kill-match`: +# ```bash +# #!/bin/bash --login +# TO_KILL=$1 +# kill $(ps aux | grep -E "$USER.+($TO_KILL)" | grep -v grep | awk '{print $2}') + + +PPSIZE=64 \ + MODEL_SIZE_KEY="GPT1T_$(( 2 * PPSIZE ))L" \ + SEQ_LEN=2048 \ + MICRO_BATCH=2 \ + GAS=$(( 8 * PPSIZE )) \ + SP_TYPE=megatron \ + ZERO_STAGE=1 \ + USE_SEQUENCE_PARALLEL=0 \ + MPSIZE=8 \ + SPSIZE=1 \ + USE_ACTIVATION_CHECKPOINTING=1 \ + ./ALCF/train-gpt3.sh From cc24a6e7b750e23bc406de35f7cd7250faf4c480 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 20 Dec 2023 19:40:33 -0600 Subject: [PATCH 025/268] Add `ALCF/llama2_vars.sh` --- ALCF/llama2_vars.sh | 168 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 ALCF/llama2_vars.sh diff --git a/ALCF/llama2_vars.sh b/ALCF/llama2_vars.sh new file mode 100644 index 0000000000..b1fa8776f8 --- /dev/null +++ b/ALCF/llama2_vars.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# This example script is contributed by external user https://github.com/nrailgun +# set -ex + +###################################### +# Change the below configurations here +# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin +# wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx +MEGATRON_DIR="/home/foremans/datascience/foremans/locations/thetaGPU/projects/argonne-lcf/Megatron-DeepSpeed" +DATA_DIR="~/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DeepSpeed/dataset/" +BASE_PATH="${MEGATRON_DIR}" +DS_CONFIG=${BASE_PATH}/deepspeed.json +DATASET_1="${DATA_DIR}/BookCorpusDataset_text_document" +# DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" +DATASET="1 ${DATASET_1}" +CHECKPOINT_PATH=./tmp +TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model + +if [[ $(hostname) == nid* || $(hostname) == login* ]]; then + DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" + 
DATA_TYPE="BookCorpusDataset_text_document" +elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then + DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" + DATA_TYPE="genslm_subsample_200k_sequence_document" +else + echo "Unable to determine DATA_PARENT for $(hostname)." + echo "Exiting!" + exit 1 +fi + +DATA_DIR="${DATA_PARENT}/dataset" +DATA_PATH="${DATA_DIR}/${DATA_TYPE}" +VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" + +DATA_LOAD_ARGS=( + "--data-path $DATA_PATH" + "--vocab-file $VOCAB_FILE" + "--merge-file $MERGE_FILE" +) + +TP=2 +PP=2 +ZERO_STAGE=0 + +GPUS_PER_NODE=$(nvidia-smi -L | wc -l) +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=$(wc -l < "${PBS_NODEFILE:-${COBALT_NODEFILE:-1}}") +NODE_RANK=0 + +HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +NUM_LAYERS=24 # e.g. llama-13b: 40 +NUM_HEADS=16 # e.g. llama-13b: 40 +SEQ_LENGTH=2048 +NUM_KV_HEADS=4 # llama2 70B uses GQA + +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens +TRAIN_STEPS=250000 # e.g. 
llama: 1T tokens / 4M tokens_per_batch = 250000 steps +LR=3e-4 +MIN_LR=3e-5 +LR_WARMUP_STEPS=2000 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +# Below configuration required for llama model as per llama paper +# --no-query-key-layer-scaling \ +# --attention-dropout 0 \ +# --hidden-dropout 0 \ +# --use-rotary-position-embeddings \ +# --untie-embeddings-and-output-weights \ +# --swiglu \ +# --normalization rmsnorm \ +# --disable-bias-linear \ +###################################### + + + +cat < $DS_CONFIG +{ + "train_batch_size" : $GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "steps_per_print": 1, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + } +} +EOT + +ds_args="" +ds_args=" --deepspeed ${ds_args}" +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [ "${activation_checkpoint}" = "true" ]; then + ds_args="--deepspeed-activation-checkpointing ${ds_args}" + + ## old argument for recomputing the transformer layer + # ds_args="--checkpoint-activations ${ds_args}" + + ## new argument for recomputing the transformer layer + ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}" + ## new argument for recomputing only the attention layer + # ds_args="--recompute-granularity selective ${ds_args}" +fi + + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +# torchrun $DISTRIBUTED_ARGS \ +# pretrain_gpt.py \ +# +MEGATRON_ARGS=( + "--tensor-model-parallel-size $TP" + "--pipeline-model-parallel-size $PP" + "--num-layers $NUM_LAYERS" + "--hidden-size $HIDDEN_SIZE" + "--ffn-hidden-size $FFN_HIDDEN_SIZE" + "--num-attention-heads $NUM_HEADS" + "--micro-batch-size $MICRO_BATCH_SIZE" + "--global-batch-size 
$GLOBAL_BATCH_SIZE" + "--seq-length $SEQ_LENGTH" + "--max-position-embeddings $SEQ_LENGTH" + "--train-iters $TRAIN_STEPS" + "--save $CHECKPOINT_PATH" + "--load $CHECKPOINT_PATH" + "--data-path $DATASET" + "--data-impl mmap" + "--tokenizer-type GPTSentencePieceTokenizer" + "--tokenizer-model $TOKENIZER_PATH" + "--split 949,50,1" + "--distributed-backend nccl" + "--lr $LR" + "--lr-decay-style cosine" + "--min-lr $MIN_LR" + "--weight-decay $WEIGHT_DECAY" + "--clip-grad $GRAD_CLIP" + "--lr-warmup-iters $LR_WARMUP_STEPS" + "--optimizer adam" + "--adam-beta1 0.9" + "--adam-beta2 0.95" + "--log-interval 1" + "--save-interval 10000" + "--eval-interval 1000" + "--eval-iters 10" + "--bf16" + "--no-query-key-layer-scaling" + "--attention-dropout 0" + "--hidden-dropout 0" + "--use-rotary-position-embeddings" + "--untie-embeddings-and-output-weights" + "--swiglu" + "--normalization rmsnorm" + "--disable-bias-linear" + "--num-key-value-heads $NUM_KV_HEADS" +) + +MEGATRON_ARGS+="${ds_args}" +MEGATRON_ARGS+="${DATA_LOAD_ARGS}" From 1dddf70b300c078eafdac3178616fbd6a48b02a9 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 20 Dec 2023 19:42:46 -0600 Subject: [PATCH 026/268] Add `pretrain_llama.py` --- pretrain_llama.py | 562 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 562 insertions(+) create mode 100644 pretrain_llama.py diff --git a/pretrain_llama.py b/pretrain_llama.py new file mode 100644 index 0000000000..bfc10704e4 --- /dev/null +++ b/pretrain_llama.py @@ -0,0 +1,562 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain GPT""" + +import os +import torch +import math + +# import logging + +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from rich import print +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import ( + average_losses_across_data_parallel_group, + update_rotary_pos_emb, +) +from megatron.arguments import core_transformer_config_from_args + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +from torch import nn +import torch.nn.functional as F + +# from ezpz import get_logger +from ezpz.dist import setup_torch, get_world_size, setup_wandb + +RANK = setup_torch( + backend="deepspeed", + port="5432", +) +WORLD_SIZE = get_world_size() +LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" + +WANDB_MODE = os.environ.get("WANDB_MODE", None) +DISABLE_WANDB = ( + WANDB_MODE is not None and str(WANDB_MODE).lower() == "disabled" +) + +if RANK == 0 and not DISABLE_WANDB: + project_name = os.environ.get( + "WB_PROJECT", + os.environ.get("WANDB_PROJECT", "GenSLM-Megatron-DS"), + ) + print("--------------------------------------------------") + print(f"Setting up W&B from: {RANK} with {project_name}") + print("--------------------------------------------------") + setup_wandb(project_name=project_name) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + print_rank_0("building GPT model ...") + see_memory_usage("Before Building Model", force=True) + args = get_args() + config = core_transformer_config_from_args(args) + # 
args = get_args() + # timers = get_timers() + if wandb.run is not None: + print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({"args": vars(args)}) + if RANK == 0: + git_ds_info() + + with deepspeed.zero.Init( + sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), + remote_device=( + None if args.remote_device == "none" + else args.remote_device, + ), + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu, + ): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from + # within training.py We need to call model.set_batch_fn after + # deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # Predompute the attention mask and store it in args. This avoids + # having to pipeline it as an activation during training. The mask + # is constant, and thus we can reuse it. + attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=get_accelerator().current_device_name(), + ) + ).view(1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. 
+ args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, cache rotary + # embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + # print_rank_0('\n ------------------------ ') + # print_rank_0(f'num of parameters {num_params}') + # print_rank_0('------------------------\n ') + print_rank_0(80 * "-") + print_rank_0(f"Number of parameters in model: {num_params}") + print_rank_0(80 * "-") + see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + wandb.run.watch( + model, + log="all", + log_graph=True, + ) + wandb.run.config.update({"num_params": num_params}) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ["text"] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ skip_mask = ( + hasattr(args, "use_flash_attn") + or hasattr(args, "flash_attn_triton") + ) + # skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask, + ) + + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if ( + "seqlen_truncate" in data_sampler_state_dict[ + "current_difficulties" + ] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_truncate" + ) + current_seqlen = ( + data_sampler_state_dict["current_difficulties"][ + "seqlen_truncate" + ] + ) + if current_seqlen < args.seq_length: + data["text"] = ( + data["text"][:, : (current_seqlen + 1)].contiguous() + ) + elif ( + "seqlen_reshape" in data_sampler_state_dict["current_difficulties"] + ): + args.data_efficiency_curriculum_learning_seqlen_type = ( + "seqlen_reshape" + ) + current_seqlen = ( + 
data_sampler_state_dict["current_difficulties"][ + "seqlen_reshape" + ] + ) + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data["text"]) + reshape_len = ( + (data["text"].size()[1] // (current_seqlen + 1)) + * (current_seqlen + 1) + ) + data["text"] = torch.cat( + ( + data["text"][:, :reshape_len] + .contiguous() + .view(-1, current_seqlen + 1), + data["text"][:, -(current_seqlen + 1):], + ), + 0, + ).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen + 1)) + num_row = min(num_row, data["text"].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data["text"] = data["text"][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of + `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ["text"] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b["text"].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < tokens.size()[1] + ): + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size: + # [batch size, seqlen] + tokens = tokens[:, : args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, : args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, : args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. 
+ averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: + return loss, { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "mos loss": mos_loss, + } + elif args.kd: + return loss, { + "total loss": loss, + "lm loss": averaged_loss[0], + "moe loss": moe_loss, + "kd loss": mos_loss, + } + print_rank_0( + ">>> total loss: {}, lm loss {}, kd loss {}".format( + loss, averaged_loss[0], mos_loss + ) + ) + else: + if max(args.num_experts) <= 1: + return loss, {"lm loss": averaged_loss[0]} + else: + loss = loss + moe_loss + return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + + +def calculate_mos_loss( + args, stu_output, teacher_model, tokens, position_ids, attention_mask +): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + attention_mask = attention_mask[ + :, :, :curriculum_seqlen, :curriculum_seqlen + ].contiguous() + # No need to truncate labels as we do not need it for the + # teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, position_ids, attention_mask + ) + assert ( + stu_output.size() == tea_output.size() + ), ( + "teacher and student output should match in size. " + f"Student: {stu_output.size()}, " + f"Teacher: {tea_output.size()}, " + f"CL seq length {args.curriculum_seqlen}" + ) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + tea_logits = F.softmax( + tea_output / kd_temp, dim=2 + ) + # The target logits is expected to be probabilities. 
If we use + # log_softmax, then we need to set target_log to true when initializing + # the KLDivLoss. + mos_loss = ( + kd_temp + * kd_temp + * nn.KLDivLoss(reduction="batchmean")(student_logits, tea_logits) + ) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers("batch-generator", log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = ( + get_batch(data_iterator) + ) + timers("batch-generator").stop() + + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if ( + hasattr( + args, + "data_efficiency_curriculum_learning_seqlen_type" + ) + and ( + args.data_efficiency_curriculum_learning_seqlen_type + == "seqlen_reshape" + ) + ): + args.data_efficiency_curriculum_learning_numel = ( + torch.numel(tokens) + ) + + if args.mos or args.kd: + # The forward func can return either the loss or the logits, depending + # on whether passing in the labels or not. 
+ stu_output, other_losses = model(tokens, position_ids, attention_mask) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + assert args.curriculum_seqlen is not None + labels = labels[:, : args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), labels + ) + else: + output_tensor, other_losses = model( + tokens, position_ids, attention_mask, labels=labels + ) + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): + loss_mask = loss_mask[:, : args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask, + ) + + # Output_tensor stores the standard loss, loos_func calculates the total + # loss. + return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0( + "> building train, validation, and test datasets " "for GPT ..." 
+ ) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path, + ) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen( + f"type {cmd}", + stdout=subprocess.PIPE, + shell=True + ) + return result.wait() == 0 + + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists("git"): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode("utf-8").strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode("utf-8").strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print( + f"**** Git info for Megatron: " + f"git_hash={git_hash} git_branch={git_branch} ****" + ) + + +def main(): + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, + data_post_process=data_post_process, + ) + return model + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + 
+ model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit() From b6fb158e110640cf5a8e387a48c3727f6fd18bde Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 21 Dec 2023 15:26:15 -0600 Subject: [PATCH 027/268] Support Llama2 pretraining --- ALCF/launch.sh | 20 +-- ALCF/llama2_vars.sh | 323 ++++++++++++++++++++++++++++++++++++++++---- ALCF/model.sh | 39 ++++++ pretrain_llama.py | 85 ++++++++---- 4 files changed, 401 insertions(+), 66 deletions(-) mode change 100644 => 100755 ALCF/llama2_vars.sh diff --git a/ALCF/launch.sh b/ALCF/launch.sh index 4c1a6ba0e0..41a620a145 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -37,14 +37,14 @@ MASTER_PORT=20010 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" MPI_WRAPPER="${SCRIPT_DIR}/mpi_wrapper" -sourceFile "${ALCF_DIR}/args.sh" +# sourceFile "${ALCF_DIR}/args.sh" MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" printJobInfo() { echo "Job started at: ${TSTAMP} on $(hostname)" echo "Job running in: ${DIR}" - echo "Training GPT-3 with ${MODEL_SIZE} parameters" + echo "Training Llama2 with ${MODEL_SIZE} parameters" echo "Writing logs to: ${OUTPUT_DIR}" echo 'to view output: tail -f $(tail -1 logfiles)' echo "i.e. 
tail -f $(tail -1 "${PARENT}"/logfiles)" @@ -70,7 +70,7 @@ fullNode() { echo "NGPUS ${NGPUS}" echo "hostfile ${DIR}/hostfile" echo "MAIN ${MAIN}" - echo "gpt_args ${gpt_args}" + echo "gpt_args ${ARGS}" NHOSTS=$(wc -l < "${HOSTFILE}") NGPU_PER_HOST=$(nvidia-smi -L | wc -l) NGPUS=$((${NHOSTS}*${NGPU_PER_HOST})) @@ -87,8 +87,8 @@ fullNode() { "${MASTER_ADDR}" "${MASTER_PORT}" "${MAIN}" - "${gpt_args}" - "${ds_args}" + "${ARGS}" + # "${ds_args}" ) # EXEC=$(join_by ' ' "${EXEC[*]}") EXEC="${EXEC[*]}" @@ -151,8 +151,9 @@ elasticDistributed() { "${MPI_ELASTIC}" "$(which python3)" "${MAIN}" - "${gpt_args}" - "${ds_args}" + "${ARGS}" + # "${gpt_args}" + # "${ds_args}" ) elif [[ $(hostname) == nid* || $(hostname) == login* ]]; then echo "Setting up from Perlmutter on $(hostname)" @@ -166,8 +167,9 @@ elasticDistributed() { "${SRUN_EXEC}" "$(which python3)" "${MAIN}" - "${gpt_args}" - "${ds_args}" + "${ARGS}" + # "${gpt_args}" + # "${ds_args}" ) else echo "Unexpected hostname $(hostname)" diff --git a/ALCF/llama2_vars.sh b/ALCF/llama2_vars.sh old mode 100644 new mode 100755 index b1fa8776f8..2fc8c3898a --- a/ALCF/llama2_vars.sh +++ b/ALCF/llama2_vars.sh @@ -1,19 +1,76 @@ #!/bin/bash # This example script is contributed by external user https://github.com/nrailgun +# [2023-12-20]: Modified by [@saforem2](https://github.com/saforem2) # set -ex +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +function join_by { local d=${1-} f=${2-}; if shift 2; then printf %s "$f" "${@/#/$d}"; fi; } + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "PARENT: ${PARENT}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + 
+HOSTNAME=$(hostname) +sourceFile "${ALCF_DIR}/setup.sh" +sourceFile "${ALCF_DIR}/model.sh" + +WORLD_SIZE="${NGPUS}" +PARALLEL_SIZE="${WORLD_SIZE}" +echo "NHOSTS * (NGPU / HOST) = $NHOSTS * $NGPU_PER_HOST = $NGPUS" + +# MODEL_LLAMA_KEY="LLAMA-24L" +# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# NUM_LAYERS=24 # e.g. llama-13b: 40 +# NUM_HEADS=16 # e.g. llama-13b: 40 +# SEQ_LENGTH=2048 +# NUM_KV_HEADS=4 # llama2 70B uses GQA +# FFN_HIDDEN_SIZE=5504 +# NUM_HEADS=16 # e.g. llama-13b: 40 ###################################### # Change the below configurations here # wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin # wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -MEGATRON_DIR="/home/foremans/datascience/foremans/locations/thetaGPU/projects/argonne-lcf/Megatron-DeepSpeed" -DATA_DIR="~/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DeepSpeed/dataset/" + +USER=$(whoami) +HERE=$(WhereAmI) +ALCF_DIR="${HERE}/ALCF" +PARENT=$(dirname "${ALCF_DIR}") +# echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +# echo "ALCF_DIR: ${ALCF_DIR}" +# # echo "PARENT: ${PARENT}" +# echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +MEGATRON_DIR="${HERE}" + +# DATA_DIR="${HOME}/datascience/foremans/locations/thetaGPU/projects/saforem2/Megatron-DeepSpeed/dataset/" BASE_PATH="${MEGATRON_DIR}" DS_CONFIG=${BASE_PATH}/deepspeed.json DATASET_1="${DATA_DIR}/BookCorpusDataset_text_document" # DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence" DATASET="1 ${DATASET_1}" -CHECKPOINT_PATH=./tmp +# CHECKPOINT_PATH=./tmp TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model if [[ $(hostname) == nid* || $(hostname) == login* ]]; then @@ -33,15 +90,20 @@ DATA_PATH="${DATA_DIR}/${DATA_TYPE}" VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo 
"ALCF_DIR: ${ALCF_DIR}" +echo "MEGATRON_DIR: ${MEGATRON_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + DATA_LOAD_ARGS=( "--data-path $DATA_PATH" "--vocab-file $VOCAB_FILE" "--merge-file $MERGE_FILE" ) -TP=2 -PP=2 -ZERO_STAGE=0 +# TP=2 +# PP=2 +# ZERO_STAGE=0 GPUS_PER_NODE=$(nvidia-smi -L | wc -l) MASTER_ADDR=localhost @@ -49,15 +111,38 @@ MASTER_PORT=6000 NNODES=$(wc -l < "${PBS_NODEFILE:-${COBALT_NODEFILE:-1}}") NODE_RANK=0 -HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 -FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 -NUM_LAYERS=24 # e.g. llama-13b: 40 -NUM_HEADS=16 # e.g. llama-13b: 40 -SEQ_LENGTH=2048 +# TP=2 +# PP=2 +# ZERO_STAGE=0 +# +export SEQ_LENGTH=${SEQ_LENGTH:-2048} +export NUM_KV_HEADS=4 # llama2 70B uses GQA +export MODEL_SIZE_KEY="${MODEL_SIZE_KEY:-LLAMA_7B}" +export MODEL_TYPE=${MODEL_TYPE:-llama} +echo "==========================+" +echo "Using ${MODEL_SIZE_KEY}" +echo "==========================+" + + +export DDP_IMPL="local" +export GAS=${GAS:-1} +export MPSIZE=${MPSIZE:-1} +export SPSIZE=${SPSIZE:-1} +export PPSIZE=${PPSIZE:-1} +export SP_TYPE=${SP_TYPE:-"ds"} +export MICRO_BATCH=${MICRO_BATCH:-1} + +# export HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# export FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# export NUM_LAYERS=24 # e.g. llama-13b: 40 +# export NUM_HEADS=16 # e.g. llama-13b: 40 +# export SEQ_LENGTH=${SEQ_LENGTH:-2048} +# export NUM_KV_HEADS=4 # llama2 70B uses GQA + NUM_KV_HEADS=4 # llama2 70B uses GQA +FFN_HIDDEN_SIZE=5504 -MICRO_BATCH_SIZE=4 -GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens +# GLOBAL_BATCH=32 # e.g. llama: 4M tokens TRAIN_STEPS=250000 # e.g. 
llama: 1T tokens / 4M tokens_per_batch = 250000 steps LR=3e-4 MIN_LR=3e-5 @@ -80,18 +165,119 @@ activation_checkpoint="false" # --disable-bias-linear \ ###################################### +# Deal with Sequence Parallel implementation --------------------------------------- +# ---------------------------------------------------------------------------------- +if [[ ${SP_TYPE} == "ds" ]]; then + # NOTE: -------------------------------------------------------------------- + # SP_TYPE="ds" has NO effect, essentially running with no Seq. parallelism + # -------------------------------------------------------------------------- + if [[ "$MPSIZE" == "${WORLD_SIZE}" ]]; then + # hacky workaround to try and use SP_TYPE="ds" + MPSIZE="${WORLD_SIZE}" + # ------------------------------------------------------------------------ + # Update [2023-08-22]: Chengming mentioned that this is an internal issue + # and will NOT work currently + # ------------------------------------------------------------------------ + echo "Caught MPSIZE: $MPSIZE from env. Setting SPSIZE=1" + SPSIZE=1 + MPSIZE="${MPSIZE}" + else + echo "Didn't catch MPSIZE from env. Setting SPSIZE=${WORLD_SIZE}, MPSIZE=1" + MPSIZE=1 + SPSIZE="${WORLD_SIZE}" + fi + if [ -z "${ZERO_STAGE}" ]; then + echo "ZERO_STAGE not set, setting to 3 for ${SP_TYPE}" + ZERO_STAGE=3 + else + echo "Caught ZERO_STAGE=${ZERO_STAGE} with ${SP_TYPE}" + fi + export SPSIZE="${SPSIZE:-$WORLD_SIZE}" + export MPSIZE="${MPSIZE:-1}" + export USE_SEQUENCE_PARALLEL=0 + export ZERO_STAGE="${ZERO_STAGE}" +elif [[ ${SP_TYPE} == "megatron" ]]; then + # NOTE: -------------------------------------------------------------------------- + # SP_TYPE="megatron" will use Megatron's Seq. 
|| implementation with ZERO_STAGE=0 + # -------------------------------------------------------------------------------- + [ "$SPSIZE" ] && echo "Caught SPSIZE: ${SPSIZE} from env" || SPSIZE=1 + [ "$MPSIZE" ] && echo "Caught MPSIZE: ${MPSIZE} from env" || MPSIZE="${WORLD_SIZE}" + [ "$ZERO_STAGE" ] && echo "Caught ${ZERO_STAGE} from env" || ZERO_STAGE=0 + [ "$USE_SEQUENCE_PARALLEL" ] && echo "Caught USE_SP: $USE_SEQUENCE_PARALLEL from env" || USE_SEQUENCE_PARALLEL=1 + if [[ ${PPSIZE} > 1 ]]; then # && ${MPSIZE}==${WORLD_SIZE} ]]; + MPSIZE=$(( WORLD_SIZE / PPSIZE )) + echo "Re-setting MPSIZE to ${WORLD_SIZE} / ${PPSIZE} = $(( WORLD_SIZE / PPSIZE ))" + echo "MPSIZE: $MPSIZE" + # MPSIZE="${WORLD_SIZE}/" + fi + export SPSIZE="${SPSIZE}" + export MPSIZE="${MPSIZE}" + export ZERO_STAGE="${ZERO_STAGE}" + export USE_SEQUENCE_PARALLEL="${USE_SEQUENCE_PARALLEL:-1}" +else + echo "Unexpected SP_TYPE: ${SP_TYPE}" + # exit 1 +fi +# ------------------------------------------------------------------------ +# +echo "####################################################" +echo "USING: ${SP_TYPE}" +echo "SPSIZE: ${SPSIZE}" +echo "PPSIZE: ${SPSIZE}" +echo "MPSIZE: ${MPSIZE}" +echo "ZERO_STAGE: ${ZERO_STAGE}" +echo "WORLD_SIZE: ${WORLD_SIZE}" +echo "USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}" +echo "####################################################" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "${SP_TYPE} sequence parallelism, with: " +echo " {MPSIZE: ${MPSIZE}, SPSIZE: ${SPSIZE}, USE_SEQUENCE_PARALLEL: ${USE_SEQUENCE_PARALLEL}} !!" 
+echo "########################################################" + +GLOBAL_BATCH=$(( NGPUS * MICRO_BATCH * GAS )) + +GLOBAL_BATCH=$(( GLOBAL_BATCH / MPSIZE / PPSIZE / SPSIZE)) + +echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP * DP) = ${NGPUS} * ${MICRO_BATCH} * ${GAS} = ${GLOBAL_BATCH} / (${MPSIZE} * ${PPSIZE} * ${PPSIZE})" +# echo "GB = (NGPUS * MB * GAS) / (MP * PP * SP) = (${NGPUS} * ${MICRO_BATCH} * ${GAS}) / (${MPSIZE} * ${PPSIZE} * ${SPSIZE}) = ${GLOBAL_BATCH}" + +if [[ "${GLOBAL_BATCH}" == 0 ]]; then + GLOBAL_BATCH=1 +fi +# [ "${GLOBAL_BATCH:-${GLOBAL_BATCH}}" == 0 ] && GLOBAL_BATCH=1 || echo "GLOBAL_BATCH: ${GLOBAL_BATCH}" + +DPSIZE=$(( $WORLD_SIZE / $PPSIZE / $MPSIZE )) + +export GLOBAL_BATCH="$(( GLOBAL_BATCH * DPSIZE ))" +# export GLOBAL_BATCH="$GLOBAL_BATCH" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" +# echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + +echo "--------------------------------" +echo "GLOBAL_BATCH=${GLOBAL_BATCH}" +echo "USING DPSIZE: ${DPSIZE}" +echo "--------------------------------" + +# REMAINDER=$(( GLOBAL_BATCH % (MICRO_BATCH * DPSIZE))) +# if [[ "${GLOBAL_BATCH} "]] + + cat < $DS_CONFIG { - "train_batch_size" : $GLOBAL_BATCH_SIZE, - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, "steps_per_print": 1, "zero_optimization": { "stage": $ZERO_STAGE }, "bf16": { "enabled": true + }, + "wandb": { + "enabled": true, + "project": "GenSLM-Megatron-DS" } } EOT @@ -118,21 +304,81 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $ # torchrun $DISTRIBUTED_ARGS \ # pretrain_gpt.py \ -# -MEGATRON_ARGS=( - "--tensor-model-parallel-size $TP" - "--pipeline-model-parallel-size $PP" - "--num-layers $NUM_LAYERS" - "--hidden-size $HIDDEN_SIZE" + + +# ┏━━━━━━━━━━━━━━━━━━━┓ +# ┃ FILE I/O SETTINGS ┃ +# ┗━━━━━━━━━━━━━━━━━━━┛ +RUN_STR="gb${GLOBAL_BATCH}_mb${MICRO_BATCH}" +RUN_STR="nl${NLAYERS}_hs${HIDDEN}_${RUN_STR}" 
+RUN_STR="mp${MPSIZE}_pp${PPSIZE}_sp${SPSIZE}_${RUN_STR}" +RUN_STR="z${ZERO_STAGE}_seqlen${SEQ_LEN}_${RUN_STR}" +RUN_STR="${MODEL_SIZE_KEY}_${RUN_STR}" + +# if [[ "${USE_FLASH_ATTN}" == 0 ]]; then +# echo "Not using Flash Attention!!" +# else +# +if [[ "${USE_FLASH_ATTN1}" || "${USE_FLASH_ATTN_V1}" ]]; then + # Flash Attention 1 + [ "${USE_FLASH_ATTN}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" + [ "${USE_FLASH_ATTN_V1}" ] && RUN_STR="flashAttn_v1_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN2}" || "${USE_FLASH_ATTN_V2}" ]]; then + # Flash Attention 2 + [ "${USE_FLASH_ATTN2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" + [ "${USE_FLASH_ATTN_V2}" ] && RUN_STR="flashAttn_v2_${RUN_STR}" +elif [[ "${USE_FLASH_ATTN_TRITON}" ]]; then + # Triton + Flash Attn + # Triton + Flash Attn + [ "${USE_FLASH_ATTN_TRITON}" ] && RUN_STR="flashAttn_triton_${RUN_STR}" +else + echo "Not using Flash Attention!" +fi + +if [[ $DDP_IMPL == 'FSDP' ]]; then + RUN_STR="FSDP_${RUN_STR}" +fi +if [[ $USE_ACTIVATION_CHECKPOINTING == 1 ]]; then + RUN_STR="actCkpt_${RUN_STR}" +fi +if [[ $USE_SEQUENCE_PARALLEL == 1 ]] ; then + RUN_STR="SP_${RUN_STR}" +fi + +RUN_STR="${MODEL_SIZE}_${RUN_STR}" + +OUTPUT_DIR="${PARENT}/outputs/${RUN_STR}" +CHECKPOINT_DIR="${PARENT}/checkpoints/$RUN_STR" +TENSORBOARD_DIR="${PARENT}/outputs/${RUN_STR}/tensorboard" + +DATE=$(date) +export DATE="${DATE}" +export RUN_STR="${RUN_STR}" +export MODEL_SIZE="${MODEL_SIZE:-${MODEL_SIZE_KEY}}" +export MODEL_SIZE="$MODEL_SIZE" +export TENSORBOARD_DIR=$TENSORBOARD_DIR +export OUTPUT_DIR=$OUTPUT_DIR +mkdir -p "$OUTPUT_DIR/tensorboard/wandb" +mkdir -p "$CHECKPOINT_DIR" +mkdir -p "$TENSORBOARD_DIR" +mkdir -p "$OUTPUT_DIR" +echo "OUTPUT TO: ${OUTPUT_DIR}" + +gpt_args=( + "--tensor-model-parallel-size $MPSIZE" + "--pipeline-model-parallel-size $PPSIZE" + "--num-layers $NLAYERS" + "--hidden-size $HIDDEN" "--ffn-hidden-size $FFN_HIDDEN_SIZE" - "--num-attention-heads $NUM_HEADS" - 
"--micro-batch-size $MICRO_BATCH_SIZE" - "--global-batch-size $GLOBAL_BATCH_SIZE" + "--num-attention-heads $ATEN_HEADS" + "--micro-batch-size $MICRO_BATCH" + "--global-batch-size $GLOBAL_BATCH" "--seq-length $SEQ_LENGTH" "--max-position-embeddings $SEQ_LENGTH" "--train-iters $TRAIN_STEPS" - "--save $CHECKPOINT_PATH" - "--load $CHECKPOINT_PATH" + "--save $CHECKPOINT_DIR" + "--load $CHECKPOINT_DIR" "--data-path $DATASET" "--data-impl mmap" "--tokenizer-type GPTSentencePieceTokenizer" @@ -162,7 +408,28 @@ MEGATRON_ARGS=( "--normalization rmsnorm" "--disable-bias-linear" "--num-key-value-heads $NUM_KV_HEADS" + "--tensorboard-dir ${TENSORBOARD_DIR}" + "--log-timers-to-tensorboard" + "--tensorboard-log-interval 1" + "--data-path $DATA_PATH" + "--vocab-file $VOCAB_FILE" + "--merge-file $MERGE_FILE" ) -MEGATRON_ARGS+="${ds_args}" -MEGATRON_ARGS+="${DATA_LOAD_ARGS}" +# DATA_LOAD_ARGS=( +# "--data-path $DATA_PATH" +# "--vocab-file $VOCAB_FILE" +# "--merge-file $MERGE_FILE" +# ) + +export gpt_args=( + "${gpt_args[*]}" + "${ds_args[*]}" + # "${DATA_LOAD_ARGS[*]}" +) +ARGS="$(join_by ' ' ${gpt_args[*]})" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "ARGS: ${ARGS}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +# gpt_args+="${ds_args}" +# gpt_args+="${DATA_LOAD_ARGS}" diff --git a/ALCF/model.sh b/ALCF/model.sh index d69ea8033a..ef01f5541d 100755 --- a/ALCF/model.sh +++ b/ALCF/model.sh @@ -15,6 +15,45 @@ declare -A A_NLAYERS declare -A A_HIDDEN declare -A A_ATEN_HEADS +# | =================== Llama 2 Architecture ==================== | +# | Hidden Size | Inter. 
Size | Atten Heads | Layers | Model Size | +# |:-----------:|:-----------:|:-----------:|:------:|:----------:| +# | 4096 | 11008 | 32 | 32 | 7b | +# | 5120 | 13824 | 40 | 40 | 13b | +# | 8192 | 28672 | 64 | 80 | 70b | + +MODEL_LLAMA_7B_KEY="LLAMA_7B" +A_NLAYERS[$MODEL_LLAMA_7B_KEY]=32 +A_ATEN_HEADS[$MODEL_LLAMA_7B_KEY]=32 +A_HIDDEN[$MODEL_LLAMA_7B_KEY]=4096 + +MODEL_LLAMA_13B_KEY="LLAMA_13B" +A_NLAYERS[$MODEL_LLAMA_13B_KEY]=40 +A_ATEN_HEADS[$MODEL_LLAMA_13B_KEY]=40 +A_HIDDEN[$MODEL_LLAMA_13B_KEY]=5120 + +MODEL_LLAMA_70B_KEY="LLAMA_70B" +A_NLAYERS[$MODEL_LLAMA_70B_KEY]=80 +A_ATEN_HEADS[$MODEL_LLAMA_70B_KEY]=64 +A_HIDDEN[$MODEL_LLAMA_70B_KEY]=8192 + +# HIDDEN_SIZE=4096 +# NUM_LAYERS=24 # e.g. llama-13b: 40 + + +# ┏━━━━━━━━━━━━━━━━━━━━┓ +# ┃ Llama2 ┃ +# ┗━━━━━━━━━━━━━━━━━━━━┛ +# MODEL_SIZE_KEY="LLAMA_24L" +# HIDDEN_SIZE=2048 # e.g. llama-13b: 5120 +# FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824 +# NUM_LAYERS=24 # e.g. llama-13b: 40 +# NUM_HEADS=16 # e.g. llama-13b: 40 +# SEQ_LENGTH=2048 +# NUM_KV_HEADS=4 # llama2 70B uses GQA +# + + # ┏━━━━━━━━━━━━━━━━━━━━┓ # ┃ GPT-3 Small: 125M ┃ # ┗━━━━━━━━━━━━━━━━━━━━┛ diff --git a/pretrain_llama.py b/pretrain_llama.py index bfc10704e4..67bf925492 100644 --- a/pretrain_llama.py +++ b/pretrain_llama.py @@ -51,6 +51,12 @@ ) if RANK == 0 and not DISABLE_WANDB: + # args = get_args() + # assert args is not None + # tensorboard_dir = args.tensorboard_dir + # if args.tensorboard_dir is not None: + # print(f'Setting (in env): {TENSORBOARD_DIR=}') + # os.environ['TENSORBOARD_DIR'] = args.tensorboard_dir project_name = os.environ.get( "WB_PROJECT", os.environ.get("WANDB_PROJECT", "GenSLM-Megatron-DS"), @@ -58,7 +64,7 @@ print("--------------------------------------------------") print(f"Setting up W&B from: {RANK} with {project_name}") print("--------------------------------------------------") - setup_wandb(project_name=project_name) + setup_wandb(project_name=project_name, sync_tensorboard=True) def model_provider(pre_process=True, 
post_process=True): @@ -66,6 +72,7 @@ def model_provider(pre_process=True, post_process=True): print_rank_0("building GPT model ...") see_memory_usage("Before Building Model", force=True) args = get_args() + assert args is not None config = core_transformer_config_from_args(args) # args = get_args() # timers = get_timers() @@ -151,23 +158,21 @@ def get_batch(data_iterator): """Generate a batch""" args = get_args() tokenizer = get_tokenizer() - + assert args is not None + assert tokenizer is not None # Items and their type. keys = ["text"] datatype = torch.int64 - # Broadcast data. if data_iterator is not None: data = next(data_iterator) else: data = None data_b = tensor_parallel.broadcast_data(keys, data, datatype) - # Unpack. tokens_ = data_b["text"].long() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. skip_mask = ( hasattr(args, "use_flash_attn") @@ -182,33 +187,29 @@ def get_batch(data_iterator): args.eod_mask_loss, skip_mask, ) - # For DS's sequence parallel seq_parallel_world_size = mpu.get_sequence_parallel_world_size() seq_parallel_world_rank = mpu.get_sequence_parallel_rank() - # For Megatron's sequence parallel if args.sequence_parallel: seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() seq_length = tokens.size(1) - assert seq_length % seq_parallel_world_size == 0 sub_seq_length = seq_length // seq_parallel_world_size sub_seq_start = seq_parallel_world_rank * sub_seq_length sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length - tokens = tokens[:, sub_seq_start:sub_seq_end] position_ids = position_ids[:, sub_seq_start:sub_seq_end] # For DS's sequence parallel if mpu.get_sequence_parallel_world_size() > 1: labels = labels[:, sub_seq_start:sub_seq_end] - return tokens, labels, loss_mask, attention_mask, position_ids def data_post_process(data, data_sampler_state_dict): args = get_args() + assert args is 
not None if args.data_efficiency_curriculum_learning: if ( "seqlen_truncate" in data_sampler_state_dict[ @@ -268,6 +269,7 @@ def get_batch_pipe(data): `data_iterator`""" args = get_args() tokenizer = get_tokenizer() + assert tokenizer is not None and args is not None # Items and their type. keys = ["text"] @@ -310,25 +312,36 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): losses = output_tensor.float() loss_mask = loss_mask.view(-1).float() loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - # Reduce loss for logging. averaged_loss = average_losses_across_data_parallel_group([loss]) - if args.mos or args.kd: + if args.mos or args.kd: # type:ignore # assert max(args.num_experts) >= 1 loss = loss + moe_loss + mos_loss - if args.mos: - return loss, { + if args.mos: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "mos loss": mos_loss, + # } + losses = { "total loss": loss, "lm loss": averaged_loss[0], "moe loss": moe_loss, "mos loss": mos_loss, } - elif args.kd: - return loss, { - "total loss": loss, - "lm loss": averaged_loss[0], - "moe loss": moe_loss, - "kd loss": mos_loss, + elif args.kd: # type:ignore + # return loss, { + # "total loss": loss, + # "lm loss": averaged_loss[0], + # "moe loss": moe_loss, + # "kd loss": mos_loss, + # } + losses = { + "total-loss": loss, + "lm-loss": averaged_loss[0], + "moe-loss": moe_loss, + "kd-loss": mos_loss, } print_rank_0( ">>> total loss: {}, lm loss {}, kd loss {}".format( @@ -336,15 +349,27 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): ) ) else: - if max(args.num_experts) <= 1: - return loss, {"lm loss": averaged_loss[0]} + if max(args.num_experts) <= 1: # type:ignore + losses = {"lm-loss": averaged_loss[0]} + # return loss, {"lm loss": averaged_loss[0]} else: loss = loss + moe_loss - return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + losses = {"lm-loss": averaged_loss[0], "moe loss": 
moe_loss} + # return loss, {"lm loss": averaged_loss[0], "moe loss": moe_loss} + if wandb is not None and wandb.run is not None: + # wandb.run.log({}) + losses |= {'loss': loss} + wandb.run.log({f"Loss/{k}": v for k, v in losses.items()}) + return loss, losses def calculate_mos_loss( - args, stu_output, teacher_model, tokens, position_ids, attention_mask + args, + stu_output, + teacher_model, + tokens, + position_ids, + attention_mask, ): mos_loss = 0 alpha = args.kd_alpha_ce @@ -399,6 +424,7 @@ def forward_step(data_iterator, model): """Forward step.""" args = get_args() timers = get_timers() + assert timers is not None and args is not None # Get the batch. timers("batch-generator", log_level=2).start() @@ -407,8 +433,8 @@ def forward_step(data_iterator, model): ) timers("batch-generator").stop() - if args.data_efficiency_curriculum_learning: - args.curriculum_seqlen = tokens.size()[1] + if args.data_efficiency_curriculum_learning: # type: ignore + args.curriculum_seqlen = tokens.size()[1] # type: ignore if ( hasattr( args, @@ -423,12 +449,13 @@ def forward_step(data_iterator, model): torch.numel(tokens) ) - if args.mos or args.kd: + assert args is not None + if args.mos or args.kd: # type:ignore # The forward func can return either the loss or the logits, depending # on whether passing in the labels or not. stu_output, other_losses = model(tokens, position_ids, attention_mask) if ( - args.curriculum_learning_legacy + args.curriculum_learning_legacy # type:ignore and args.curriculum_seqlen < args.seq_length ): assert args.curriculum_seqlen is not None @@ -473,7 +500,7 @@ def forward_step(data_iterator, model): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() - + assert args is not None print_rank_0( "> building train, validation, and test datasets " "for GPT ..." 
) From 657bb3e312e793d7b503f475b59e44c1aee44205 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 21 Dec 2023 17:47:44 -0600 Subject: [PATCH 028/268] Add `ALCF/train-llama.sh` --- ALCF/train-llama.sh | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100755 ALCF/train-llama.sh diff --git a/ALCF/train-llama.sh b/ALCF/train-llama.sh new file mode 100755 index 0000000000..6483b55c54 --- /dev/null +++ b/ALCF/train-llama.sh @@ -0,0 +1,74 @@ +#!/bin/bash --login + +TSTAMP=$(date "+%Y-%m-%d-%H%M%S") + +# HERE=$(python3 -c 'import os; print(os.getcwd())') +# ALCF_DIR="${HERE}/ALCF" +# +function WhereAmI() { + python3 -c 'import os; print(os.getcwd())' +} + +HERE=$(WhereAmI) +# ALCF_DIR=$(find "${HERE}" -name "ALCF") +ALCF_DIR="${HERE}/ALCF" + + +# ALCF_DIR="$(dirname $(dirname $(python3 -c 'import megatron; print(megatron.__file__)' | tail -1)))/ALCF" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" +echo "ALCF_DIR: ${ALCF_DIR}" +echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" + +# SOURCE=${BASH_SOURCE[0]} +# while [ -L "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# SOURCE=$(readlink "$SOURCE") +# [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +# done +# DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd ) +# + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ Make sure we're not already running; if so, exit here ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +PIDS=$(ps aux | egrep "$USER.+mpi.+pretrain.+.py" | grep -v grep | awk '{print $2}') +if [ -n "${PIDS}" ]; then + echo "Already running! Exiting!" 
+ exit 1 +fi + +function sourceFile() { + FILE="$1" + echo "source-ing ${FILE}" + if [[ -f "${FILE}" ]]; then + # shellcheck source="${FILE}" + source "${FILE}" + else + echo "ERROR: UNABLE TO SOURCE ${FILE}" + fi +} + +#┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +#┃ source ./launch.sh ┃ +#┃ which then sources ./{args.sh,setup.sh} ┃ +#┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ +LAUNCH_FILE="${ALCF_DIR}/launch.sh" + +sourceFile "${ALCF_DIR}/setup.sh" +# sourceFile "${ALCF_DIR}/model.sh" +# sourceFile "${ALCF_DIR}/args.sh" +sourceFile "${ALCF_DIR}/llama2_vars.sh" +sourceFile "${LAUNCH_FILE}" + +setup +# singleGPU "$@" 2>&1 & +# fullNode "$@" 2>&1 & +TORCH_VERSION=$(python3 -c 'import torch; print(torch.__version__)') +export TORCH_VERSION=$TORCH_VERSION +export CUDA_DEVICE_MAX_CONNECTIONS=1 +# elasticDistributed "$@" 2>&1 & +# elasticDistributed "$@" +# PID=$! +# wait $PID +elasticDistributed "$@" 2>&1 & + From 94e91d607f7f6ed2d99806c3a1d3491149af2c67 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Jan 2024 09:38:00 -0600 Subject: [PATCH 029/268] Update `DATA_PARENT` in `ALCF/args.sh` --- ALCF/args.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index 08d3d8fcd2..fb66f82b12 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -163,7 +163,12 @@ if [[ $(hostname) == nid* || $(hostname) == login* ]]; then DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" DATA_TYPE="BookCorpusDataset_text_document" elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then - DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" + # DATA_PARENT="/lus/grand" + DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" # GenSLMSubSample200k" + # DATA_TYPE="GenSLMSubSample200k" + # DATA_PARENT="/home/foremans/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed" + # DATA_TYPE="books-0001_text_document" + # 
DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" DATA_TYPE="genslm_subsample_200k_sequence_document" else echo "Unable to determine DATA_PARENT for $(hostname)." From d4545764b4a9d5b9155fc2a11a8f22fb9eeae00a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Jan 2024 10:48:34 -0600 Subject: [PATCH 030/268] Update `pretrain_llama.py` --- pretrain_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_llama.py b/pretrain_llama.py index 67bf925492..ab7ffc785c 100644 --- a/pretrain_llama.py +++ b/pretrain_llama.py @@ -64,7 +64,7 @@ print("--------------------------------------------------") print(f"Setting up W&B from: {RANK} with {project_name}") print("--------------------------------------------------") - setup_wandb(project_name=project_name, sync_tensorboard=True) + setup_wandb(project_name=project_name) def model_provider(pre_process=True, post_process=True): From 4ab9532dd094158c1e1dc321a8816d38eb4da1ae Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Jan 2024 10:48:56 -0600 Subject: [PATCH 031/268] Update `ALCF/args.sh` --- ALCF/args.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ALCF/args.sh b/ALCF/args.sh index fb66f82b12..17e0018110 100755 --- a/ALCF/args.sh +++ b/ALCF/args.sh @@ -163,7 +163,6 @@ if [[ $(hostname) == nid* || $(hostname) == login* ]]; then DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" DATA_TYPE="BookCorpusDataset_text_document" elif [[ $(hostname) == theta* || $(hostname) == x3* ]]; then - # DATA_PARENT="/lus/grand" DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" # GenSLMSubSample200k" # DATA_TYPE="GenSLMSubSample200k" # DATA_PARENT="/home/foremans/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed" @@ -181,7 +180,7 @@ DATA_PATH="${DATA_DIR}/${DATA_TYPE}" VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" # # -# [ 
"$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" +# [ "$(hostname)==login*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed"_ # [ "$(hostname)==nid*" ] && DATA_PARENT="/global/homes/f/foremans/m3957/foremans/projects/saforem2/Megatron-DeepSpeed" # # [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/eagle/projects/datasets/BookCorpusDataset" # [ "$(hostname)==theta*" ] && DATA_PARENT="/lus/grand/projects/fallwkshp23/datasets/GenSLMSubSample200k" From d08b868244bd2ce9c49a1cb4dd7a518a9edc8337 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Jan 2024 10:49:23 -0600 Subject: [PATCH 032/268] Update `.gitignore` --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 64270f0752..fc81539941 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.gz +*.txt +*.idx +*.bin +*.log __pycache__ # Distribution / packaging @@ -20,4 +25,4 @@ slurm* logs # Data folder -bookcorpus_data/ \ No newline at end of file +bookcorpus_data/ From e4bac7019db92cfe49d6eeac00b5721652e516fe Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Jan 2024 15:30:31 -0600 Subject: [PATCH 033/268] Create README.md --- ALCF/README.md | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 ALCF/README.md diff --git a/ALCF/README.md b/ALCF/README.md new file mode 100644 index 0000000000..89fb639da8 --- /dev/null +++ b/ALCF/README.md @@ -0,0 +1,93 @@ +# Megatron-DeepSpeed (@ [ALCF](https://alcf.anl.gov)) + +![image](https://github.com/argonne-lcf/Megatron-DeepSpeed/assets/5234251/f06df155-30e8-4894-a4c2-c17ff4b34ada) + +We describe below the instructions for launching distributed training with +Microsoft's Megatron-DeepSpeed and briefly describe some parallelism +strategies and various optimizations that are supported. 
+ +> [!IMPORTANT] +> We maintain this (forked) version at +> [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) +> that has some [helper scripts](#helper-scripts) for launching and setting +> various training options. +> +> These changes are entirely self-contained **HERE** in [`ALCF/`](.) + +## Setup + +1. Load `conda` and activate base environment: + + ```bash + # load conda + activate base env + module load conda/2023-10-04 ; conda activate base + ``` + +1. Clone + [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) + and navigate into it: + + ```bash + # clone + navigate into Megatron-DeepSpeed repo + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed + cd Megatron-DeepSpeed + ``` + +1. Make virtual environment (on top of base conda): + + ```bash + # make virtual environment (on top of base conda) + mkdir -p venvs/polaris/2023-10-04 + python3 -m venv venvs/polaris/2023-10-04 --system-site-packages + source venvs/polaris/2023-10-04/bin/activate + ``` + +1. Install missing dependency: + + ```bash + # install *missing dependency + python3 -m pip install "git+https://github.com/saforem2/ezpz" + ``` + +1. 
Launch training: + + ```bash + # ---- launch training ----------------------- + # - MODEL_SIZE_KEY: defined in ALCF/model.sh + # - other args: defined in ALCF/args.sh + # --------------------------------------------- + MODEL_SIZE_KEY="GPT25B" \ + SEQ_LEN=4096 \ + USE_FLASH_ATTN_V2=1 \ + MICRO_BATCH=1 \ + GAS=1 \ + SP_TYPE="megatron" \ + ZERO_STAGE=1 \ + ./ALCF/train-gpt3.sh + ``` + + +## Helper Scripts + +- 📂 [`ALCF/`](https://github.com/argonne-lcf/Megatron-DeepSpeed/tree/main/ALCF) + `├──` [`args.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/args.sh) + `├──` [`launch.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/launch.sh) + `├──` [`model.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/model.sh) + `├──` [`setup.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/setup.sh) + `├──` [`submit-pbs.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit-pbs.sh) + `├──` [`submit.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/submit.sh) + `└──` [`train-gpt3.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/train-gpt3.sh) + + +
+
ALCF/train-gpt3.sh +
Main entry point for training. This script will automatically source the rest of the required ALCF/*.sh scripts below
+
ALCF/model.sh
+
Contains some example model architectures for GPT3-style models
+
ALCF/args.sh
+
Logic for parsing / setting up runtime options for Megatron and DeepSpeed.
+
ALCF/setup.sh
+
Locate and activate the virtual environment to be used, and ensure MPI variables are set properly
+
ALCF/launch.sh
+
Identify available resources and build the command to be run, i.e., figure out how many `{nodes, GPUs per node, GPUs total}` to pass to `mpi{run,exec}`, then use this to build `mpiexec {mpiexec-args} python3 pretrain_gpt.py`
+
From aa7a580108ced77c0dd0335542bc228c008e3d87 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 1 Feb 2024 00:41:41 -0600 Subject: [PATCH 034/268] added support for training with multiple files --- megatron/arguments.py | 6 ++- megatron/tokenizer/tokenizer.py | 79 ++++++++++++++++++++++++--------- pretrain_gpt.py | 45 ++++++++++++++----- 3 files changed, 96 insertions(+), 34 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 30a95266b2..083bf30f6a 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1203,6 +1203,9 @@ def _add_data_args(parser): 'single dataset used for all three: train, valid ' 'and test. It is exclusive to the other ' '--*-data-path args') + group.add_argument('--data-file-list', type=str, default=None, + help='The file with the list of dataset and weights') + group.add_argument('--split', type=str, default='969, 30, 1', help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' @@ -1264,7 +1267,8 @@ def _add_data_args(parser): 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'HFTokenizer', - 'NullTokenizer'], + 'NullTokenizer', + 'Llama2Tokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='Sentencepiece tokenizer model.') diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 43c251bab1..023c2f756c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -8,6 +8,8 @@ from transformers import AutoTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer + + def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: @@ -35,12 +37,15 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPTSentencePieceTokenizer': assert args.tokenizer_model is not None tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type 
== 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) elif args.tokenizer_type == 'NullTokenizer': assert args.vocab_size is not None tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == 'HFTokenizer': assert args.tokenizer_model is not None - tokenizer = _HFTokenizer(args.tokenizer_model,args.seq_length) + tokenizer = _HFTokenizer(args.tokenizer_model) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -504,6 +509,56 @@ def eod(self): def additional_special_tokens_ids(self): return None + +class _Llama2Tokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file,): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + # BOS / EOS token IDs + self.n_words: int = self.tokenizer.vocab_size() + self.bos_id: int = self.tokenizer.bos_id() + self.eos_id: int = self.tokenizer.eos_id() + self.pad_id: int = self.tokenizer.pad_id() + assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size() + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + assert type(s) is str + t = self.tokenizer.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + class _NullTokenizer: def __init__(self, vocab_size): vocab_size = int(vocab_size) @@ -540,28 +595,10 @@ def additional_special_tokens_ids(self): class _HFTokenizer(AbstractTokenizer): """HF Tokenizer""" - def __init__(self, tokenizer_name_or_path,max_seq_len): + 
def __init__(self, tokenizer_name_or_path): name = tokenizer_name_or_path super().__init__(name) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,padding_side="right",use_fast=False) - - DEFAULT_PAD_TOKEN = "[PAD]" - DEFAULT_EOS_TOKEN = "" - DEFAULT_BOS_TOKEN = "" - DEFAULT_UNK_TOKEN = "" - special_tokens_dict = dict() - if self.tokenizer.pad_token is None: - special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN - if self.tokenizer.eos_token is None: - special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN - if self.tokenizer.bos_token is None: - special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN - if self.tokenizer.unk_token is None: - special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN - self.tokenizer.add_special_tokens(special_tokens_dict) - # if self.tokenizer.pad_token == None: - # self.tokenizer.pad_token= "[PAD]" - self.tokenizer.model_max_length = max_seq_len + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 86bfb74713..0c82e0ff0e 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -143,7 +143,6 @@ # if model_size is not None: # wandb.run.config.update({'MODEL_SIZE': model_size}) - def model_provider(pre_process=True, post_process=True): """Build the model.""" print_rank_0('building GPT model ...') @@ -219,7 +218,6 @@ def model_provider(pre_process=True, post_process=True): wandb.run.config.update({'num_params': num_params}) return model - def get_batch(data_iterator): """Generate a batch""" args = get_args() @@ -275,6 +273,7 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids + def data_post_process(data, data_sampler_state_dict): args = get_args() if args.data_efficiency_curriculum_learning: @@ -300,6 +299,7 @@ def data_post_process(data, data_sampler_state_dict): args.data_efficiency_curriculum_learning_seqlen_type 
= None return data + def get_batch_pipe(data): """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" args = get_args() @@ -385,6 +385,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at mos_loss = mos_loss.div(args.seq_length) * beta return mos_loss + def forward_step(data_iterator, model): """Forward step.""" args = get_args() @@ -438,8 +439,24 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): print_rank_0('> building train, validation, and test datasets ' 'for GPT ...') + files = [] + if args.data_file_list is not None: + with open(args.data_file_list, 'r') as flist: + for f in flist.readlines(): + w, fname = f.split() + files.append(float(w)) + files.append(fname) + elif len(args.data_path)==1 and os.path.isdir(args.data_path[0]): + path=args.data_path[0] + "/" + for f in os.listdir(path): + if (os.path.isfile(path + f) and f.find(".bin")!=-1): + files.append(1) + files.append(path + f.split(".bin")[0]) + else: + files = args.data_path + print_rank_0(f"file list {files}") train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, + data_prefix=files, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, @@ -485,15 +502,19 @@ def git_ds_info(): def main(): # if RANK == 0: # setup_wandb() - - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + args 
= get_args() + prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + # # from megatron.training import get_model # if wandb.run is not None: # args = get_args() From 258269c08c7b429a59ac9c7d09289f686aab9f52 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 1 Feb 2024 00:44:21 -0600 Subject: [PATCH 035/268] added training script --- llama.sh | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100755 llama.sh diff --git a/llama.sh b/llama.sh new file mode 100755 index 0000000000..323a5f4730 --- /dev/null +++ b/llama.sh @@ -0,0 +1,64 @@ +#!/bin/bashOA +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug-scaling +#PBS -l select=6 +#PBS -l filesystems=eagle:grand:home +export PPN=4 +export MD=/home/hzheng/ALCF-Megatron-DeepSpeed +module load conda/2023-10-04 +#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 +conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +cd ${PBS_O_WORKDIR} +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) +export TP=1 +export PP=1 +export MBS=1 +export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) +#export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/" +export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" +export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_small.txt" +echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers 32 \ + --hidden-size 4096 \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --train-iters 10 \ + --save 
${MD}/checkpoints/LLAMA_7B_LLAMA_7B_z2_seqlen_mp1_pp1_sp24_nl32_hs4096_gb${BS}_mb1 \ + --load ${MD}/checkpoints/LLAMA_7B_LLAMA_7B_z2_seqlen_mp1_pp1_sp24_nl32_hs4096_gb${BS}_mb1 \ + --data-impl mmap \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 --tensorboard-dir ${MD}/outputs/LLAMA_7B_LLAMA_7B_z3_seqlen_mp1_pp1_sp24_nl32_hs4096_gb24_mb1/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=2 --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed From d320cefaf5ba7dc23ee0fc032ef81304c7edc8fd Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:10:43 -0600 Subject: [PATCH 036/268] Update `pretrain_gpt.py` --- pretrain_gpt.py | 251 ++++-------------------------------------------- 1 file changed, 20 insertions(+), 231 deletions(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 86bfb74713..785a129156 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -2,15 +2,11 @@ """Pretrain GPT""" -import os import torch import math -# import logging - from functools import partial from megatron import get_args from megatron import print_rank_0 -from rich import print from megatron import get_timers from megatron import 
get_tokenizer from megatron.core import mpu, tensor_parallel @@ -21,144 +17,32 @@ from megatron.utils import get_ltor_masks_and_position_ids from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb from megatron.arguments import core_transformer_config_from_args -from megatron.utils import ( - report_memory, - throughput_calculator, - checkpoint_throughput_calculator -) -from pathlib import Path import deepspeed from deepspeed.runtime.utils import see_memory_usage from deepspeed.accelerator.real_accelerator import get_accelerator +import os import subprocess -import wandb -import time from torch import nn import torch.nn.functional as F -# from ezpz import get_logger -from ezpz.dist import setup_torch, get_world_size, setup_wandb - -RANK = setup_torch( - backend='deepspeed', - port='5432', -) -WORLD_SIZE = get_world_size() -LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" - -WANDB_MODE = os.environ.get('WANDB_MODE', None) -DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' - -if RANK == 0 and not DISABLE_WANDB: - project_name = ( - os.environ.get( - 'WB_PROJECT', - os.environ.get( - 'WANDB_PROJECT', - 'GenSLM-Megatron-DS' - ), - ) - ) - print('--------------------------------------------------') - print(f"Setting up W&B from: {RANK} with {project_name}") - print('--------------------------------------------------') - setup_wandb(project_name=project_name) - - -# os.environ[''] -# wblogger = logging.getLogger("wandb") -# wblogger.setLevel(logging.DEBUG) - -# log = get_logger(__name__, level=LEVEL) -# -# log.critical(f"Hello from rank: {RANK} / {WORLD_SIZE} !") - -import socket -from typing import Optional -# log.critical(f"Setting up W&B from rank: {RANK} with {wb_project_name}") - - -# def setup_wandb(project_name: Optional[str] = None): -# print(f"Setting up W&B from: {RANK}") -# project_name = ( -# os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') -# if project_name is None else project_name -# ) -# 
print(f"Setting up wandb from rank: {RANK}") -# print(f"Using: WB PROJECT: {project_name}") -# # if get_rank() == 0: -# # tensorboard_dir = args.tensorboard_dir -# tensorboard_dir = None -# # if config is None: -# tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) -# # else: -# # tensorboard_dir = ( -# # config.get( -# # 'tensorboard_dir', -# # None, # os.getcwd() -# # ) -# # ) -# if tensorboard_dir is not None: -# print(f'Patching tensorboard from {tensorboard_dir}') -# wandb.tensorboard.patch(root_logdir=tensorboard_dir) -# # wbrun_id = wandb.util.generate_id() -# current_time = time.time() -# # local_time = time.localtime(current_time) -# # if wandb.run is None: -# wandb.init( -# resume='allow', -# sync_tensorboard=(tensorboard_dir is not None), # True, -# project=(project_name if project_name is not None else None), -# # dir=(tensorboard_dir if tensorboard_dir is not None else None), -# ) -# assert wandb.run is not None -# print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") -# wandb.run.config.update({'current_time': current_time}) -# model_size = os.environ.get('MODEL_SIZE', None) -# wandb.run.config.update({'world_size': get_world_size()}) -# # if config is not None: -# # wandb.run.config.update(config) -# env = { -# k: v for k, v in dict(os.environ).items() -# if not k.startswith('_ModuleTable') -# } -# _ = env.pop('LS_COLORS', None) -# _ = env.pop('PS1', None) -# wandb.run.config.update({'env': env}) -# hostname = socket.gethostbyaddr(socket.gethostname())[0] -# if hostname.startswith('theta'): -# wandb.run.config.update({'machine': 'ThetaGPU'}) -# elif hostname.startswith('x3'): -# wandb.run.config.update({'machine': 'Polaris'}) -# elif hostname.startswith('x1'): -# wandb.run.config.update({'machine': 'Sunspot'}) -# elif hostname.startswith('nid'): -# wandb.run.config.update({'machine': 'Perlmutter'}) -# elif hostname.startswith('login'): -# wandb.run.config.update({'machine': 'NERSC'}) -# else: -# wandb.run.config.update({'machine': hostname}) 
-# if model_size is not None: -# wandb.run.config.update({'MODEL_SIZE': model_size}) - def model_provider(pre_process=True, post_process=True): """Build the model.""" + print_rank_0('building GPT model ...') - see_memory_usage("Before Building Model", force=True) + see_memory_usage(f"Before Building Model", force=True) + args = get_args() config = core_transformer_config_from_args(args) - # args = get_args() - # timers = get_timers() - if wandb.run is not None: - print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") - wandb.run.config.update({"args": vars(args)}) - if RANK == 0: - git_ds_info() - - with deepspeed.zero.Init(sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), + if hasattr(mpu, 'get_sequence_parallel_group'): + dpg = mpu.get_sequence_parallel_group() + elif hasattr(mpu, 'get_data_parallel_group'): + dpg = mpu.get_data_parallel_group() + else: + dpg = None + with deepspeed.zero.Init(data_parallel_group=dpg, remote_device=None if args.remote_device == 'none' else args.remote_device, config_dict_or_path=args.deepspeed_config_dict, enabled=args.zero_stage == 3, @@ -202,21 +86,7 @@ def model_provider(pre_process=True, post_process=True): pre_process=pre_process, post_process=post_process ) - num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - # print_rank_0('\n ------------------------ ') - # print_rank_0(f'num of parameters {num_params}') - # print_rank_0('------------------------\n ') - print_rank_0(80 * '-') - print_rank_0(f"Number of parameters in model: {num_params}") - print_rank_0(80 * '-') - see_memory_usage("After Building Model", force=True) - if wandb.run is not None: - wandb.run.watch( - model, - log='all', - log_graph=True, - ) - wandb.run.config.update({'num_params': num_params}) + see_memory_usage(f"After Building Model", force=True) return model @@ -242,8 +112,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Get the masks and postition ids. 
- skip_mask = hasattr(args, 'use_flash_attn') or hasattr(args, 'flash_attn_triton') - # skip_mask = args.use_flash_attn or args.use_flash_attn_triton + skip_mask = args.use_flash_attn or args.use_flash_attn_triton attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, tokenizer.eod, @@ -482,91 +351,11 @@ def git_ds_info(): print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') -def main(): - # if RANK == 0: - # setup_wandb() - - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) - # # from megatron.training import get_model - # if wandb.run is not None: - # args = get_args() - # timers = get_timers() - # # model = get_model(model_provider, ModelType.encoder_or_decoder) - # elapsed_time = timers('interval-time').elapsed(barrier=True) - # total_iterations = os.environ.get( - # "TOTAL_ITERATIONS", - # (args.train_iters + args.eval_iters) - # ) - # seq_len = args.seq_length - # elapsed_time_per_iteration = elapsed_time / total_iterations - # if model is not None: - # samples_per_sec, tflops, approx_params_in_billions = throughput_calculator( - # model, - # args, - # elapsed_time, - # total_iterations, - # ) - # # Compute throughput. - # samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size - # tokens_per_sec = samples_per_sec * seq_len - # tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size - # sample_consumption_rate = args.consumed_train_samples / elapsed_time - # token_consumption_rate = args.consumed_train_tokens / elapsed_time - # # Tensorboard values. 
- # tdata = { - # # 'iteration': iteration, - # 'consumed_train_samples': args.consumed_train_samples, - # 'consumed_train_tokens': args.consumed_train_tokens, - # # 'learning_rate': learning_rate, - # # 'batch_size': batch_size, - # # 'loss_scale': loss_scale, - # # 'grad_norm': grad_norm, - # } - # # for key in loss_dict: - # # tdata[f'lm-loss/{key}'] = loss_dict[key] - # - # tdata = {f'train/{k}': v for k, v in tdata.items()} - # # if wbrun is not None and wbrun is wandb.run: - # if wandb.run is not None: - # wandb.run.log(tdata, commit=False) - # tput = { - # 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - # 'throughput/samples_per_sec': samples_per_sec, - # 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - # 'throughput/tokens_per_sec': tokens_per_sec, - # 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - # 'throughput/tflops': tflops, - # 'throughput/approx_params_in_billions': approx_params_in_billions, - # 'throughput/sample_consumption_rate': sample_consumption_rate, - # 'throughput/token_consumption_rate': token_consumption_rate, - # 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, - # } - # wandb.run.log(tput) - return model - - - if __name__ == "__main__": - # git_ds_info() - # pretrain(train_valid_test_datasets_provider, - # model_provider, - # ModelType.encoder_or_decoder, - # forward_step, - # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - # data_post_process=data_post_process) - import sys - import deepspeed.comm as dist - model = main() - dist.log_summary() - if wandb.run is not None: - print(f"wandb.run.name: {wandb.run.name}") - print(f"wandb.run.url: {wandb.run.url}") - wandb.finish() - sys.exit() + git_ds_info() + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process) From 
080003144a529f588c9f8b9d2d37472ae6d725d3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:12:02 -0600 Subject: [PATCH 037/268] Add `set_params.sh` --- set_params.sh | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 set_params.sh diff --git a/set_params.sh b/set_params.sh new file mode 100644 index 0000000000..40682a9fe3 --- /dev/null +++ b/set_params.sh @@ -0,0 +1,110 @@ +#!/bin/bash login +# echo "!!!please use generate_hostfile.sh to set hostfile for 18 nodes before training" +export WORLD_SIZE=${WORLD_SIZE:-216} +export MICRO_BATCH=${MICRO_BATCH:-1} +export NLAYERS=${NLAYERS:-96} +export HIDDEN=${HIDDEN:-12288} +export HEADS=${HEADS:-96} +export SEQ=${SEQ:-2048} +export TRAIN_ITER=${TRAIN_ITER:-20} +export ZERO_STAGE=${ZERO_STAGE:-3} +export DTYPE=${DTYPE:-fp16} +export TP=${TP:-1} +export PP=${PP:-1} +export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} +export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) + + +# bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ + +# Disabling tensor/pipeline parallelism +TP=${TP:-1} +PP=${PP:-1} + +export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" +export DATA_TYPE="BookCorpusDataset_text_document" +# export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" +# export DATA_TYPE="genslm_subsample_200k_sequence_document" +export DATA_DIR="${DATA_PARENT}/dataset" +export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" +export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" + + +DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" +bash ./generate_config.sh ${DS_CONFIG} || exit 1 + +OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} +mkdir -p $OUTPUT_DIR +echo "!!!Please see 
logs at ${OUTPUT_DIR}" + +# Hostfile path +hostfile_deepspeed=./hostfile_deepspeed +hostfile_mpich=./hostfile_mpich +cat $PBS_NODEFILE > hostfile_mpich +cat $PBS_NODEFILE > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed + +ds_args=" " +ds_args=" --deepspeed ${ds_args}" +if [ $PP == 1 ]; then + ds_args=" --no-pipeline-parallel ${ds_args}" +fi +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" +# we are now using activation checkpoint provided by megatron, see below. +# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + +# take custom args +custom_args=" $@" + +# launcher setting +LAUNCHER=${LAUNCHER:-MPICH} +if [[ $LAUNCHER == "deepspeed" ]]; then + launcher="" +else + launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" +fi + +NCCL=${NCCL:-nccl} + +run_cmd=" + deepspeed $launcher pretrain_gpt.py \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --num-layers $NLAYERS \ + --hidden-size $HIDDEN \ + --num-attention-heads $HEADS \ + --seq-length $SEQ \ + --max-position-embeddings $SEQ \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --train-iters $TRAIN_ITER \ + --lr 0.00015 \ + --lr-warmup-fraction .01 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --log-interval 1 \ + --eval-iters 100 \ + --eval-interval 100 \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --save-interval 500 \ + --split 100,0,0 \ + --$DTYPE \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing + $ds_args \ + --no-masked-softmax-fusion \ + --no-bias-gelu-fusion \ + --no-bias-dropout-fusion \ + --no-gradient-accumulation-fusion \ + --distributed-backend $NCCL \ + --num-workers 0 \ + $custom_args \ + |& tee $OUTPUT_DIR/output.log + " + +echo ${run_cmd} +eval ${run_cmd} +set +x From a5eabe5fbf6ffa99ab03769d9dd81c657fa721ac Mon Sep 
17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:12:11 -0600 Subject: [PATCH 038/268] Add `generate_config.sh` --- generate_config.sh | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 generate_config.sh diff --git a/generate_config.sh b/generate_config.sh new file mode 100644 index 0000000000..210220e192 --- /dev/null +++ b/generate_config.sh @@ -0,0 +1,133 @@ +#!/bin/bash --login + +for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ + "$PP" "$DTYPE" +do + if [ -z $v ]; then + echo "Please export required envs before execute $0" + exit 1 + fi +done + +if [ $# -ne 1 ]; then + echo "Usage: $0 config_file" + exit 1 +fi + +extra="" +common="\ + \"train_batch_size\": $GLOBAL_BATCH, + \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, + \"steps_per_print\": 1, + \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, + \"optimizer\": { + \"type\": \"Adam\", + \"params\": { + \"lr\": 0.00015, + \"weight_decay\": 1e-2 + } + }, + \"zero_allow_untested_optimizer\": true, + \"gradient_clipping\": 1.0, + \"activation_checkpointing\": { + \"partition_activations\": true, + \"contiguous_memory_optimization\": false + }, + \"wall_clock_breakdown\": false," + +flops_profiler="\ + \"flops_profiler\": { + \"enabled\": false, + \"profile_step\": 45, + \"module_depth\": -1, + \"top_modules\": 1, + \"detailed\": true, + \"output_file\": null + }" + +if [[ $DTYPE == "bf16" ]]; then +dtype="\ + \"communication_data_type\": \"bfp16\", + \"fp16\": { + \"enabled\": false, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": true, + \"loss_scale\": 1.0 + }," +else +dtype="\ + \"communication_data_type\": \"fp16\", + \"fp16\": { + \"enabled\": true, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": false, + \"loss_scale\": 1.0 + }," +fi + +if [ 
$ZERO_STAGE == 3 ]; then +zero="\ + \"zero_optimization\": { + \"stage\": 3, + \"reduce_scatter\": false, + \"stage3_max_live_parameters\": 3e9, + \"stage3_max_reuse_distance\": 3e9, + \"stage3_param_persistence_threshold\": 1e5, + \"stage3_prefetch_bucket_size\": 5e7, + \"contiguous_gradients\": true, + \"overlap_comm\": true, + \"reduce_bucket_size\": 90000000, + \"sub_group_size\": 1e9, + \"offload_optimizer\": { + \"device\": \"none\", + \"buffer_count\": 4, + \"pipeline_read\": false, + \"pipeline_write\": false, + \"pin_memory\": true + } + }," +elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then +zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE + }," + if [ $ZERO_STAGE == 1 ]; then + if [ $PP > 1 ]; then + extra="\ + \"data_types\": { + \"grad_accum_dtype\": \"fp32\" + }, + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," + else + echo 'please add the config for zero_stage 1 without pipeline-parallelism' + fi + fi +else + echo 'Please add the correct config set!!!' +fi + +# flops_profiler must at the end because no ',' is allowed at the end +cat < $1 +{ +$common +$zero +$dtype +$extra +$flops_profiler +} +EOT From 3baad6fddedc32337b5f9d4065f2cc2b3625ac1f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:25:07 -0600 Subject: [PATCH 039/268] Add `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 387 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 pretrain_gpt_alcf.py diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py new file mode 100644 index 0000000000..708a3b7864 --- /dev/null +++ b/pretrain_gpt_alcf.py @@ -0,0 +1,387 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain GPT""" + +import torch +import math +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb +from megatron.arguments import core_transformer_config_from_args + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import os +import subprocess + +from torch import nn +import torch.nn.functional as F + +# from ezpz.configs import get_logging_config +from ezpz.dist import get_rank, get_world_size, setup_wandb # , setup_torch + +RANK = get_rank() +WORLD_SIZE = get_world_size() + +if RANK == 0 and not ( + (wbmode := os.environ.get('WANDB_MODE')) + and (wbmode is not None and str(wbmode).lower() == 'disabled') +): + project_name = ( + os.environ.get( + 'WB_PROJECT', + os.environ.get( + 'WANDB_PROJECT', + 'GenSLM-Megatron-DS' + ), + ) + ) + print('--------------------------------------------------') + print(f"Setting up W&B from: {RANK} with {project_name}") + print('--------------------------------------------------') + setup_wandb(project_name=project_name) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + config = core_transformer_config_from_args(args) + if hasattr(mpu, 'get_sequence_parallel_group'): + dpg = mpu.get_sequence_parallel_group() + elif hasattr(mpu, 'get_data_parallel_group'): + dpg = 
mpu.get_data_parallel_group() + else: + dpg = None + with deepspeed.zero.Init(data_parallel_group=dpg, + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config_dict, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # Predompute the attention mask and store it in args. This avoids having to + # pipeline it as an activation during training. The mask is constant, and thus + # we can reuse it. + attention_mask = torch.tril(torch.ones( + (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view( + 1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, cache rotary embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + see_memory_usage(f"After Building Model", force=True) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. 
+ if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask) + + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + + return tokens, labels, loss_mask, attention_mask, position_ids + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if current_seqlen < args.seq_length: + data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() + elif 'seqlen_reshape' in 
data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data['text']) + reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) + data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), + data['text'][:, -(current_seqlen+1):]), 0).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen+1)) + num_row = min(num_row, data['text'].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data['text'] = data['text'][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] + tokens = tokens[:, :args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, :args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: + return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'mos loss': mos_loss} + elif args.kd: + return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'kd loss': mos_loss} + print_rank_0('>>> total loss: {}, lm loss {}, kd loss {}'.format(loss, averaged_loss[0], mos_loss)) + else: + if max(args.num_experts) <= 1: + return loss, {'lm loss': averaged_loss[0]} + else: + loss = loss + moe_loss + return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} + +def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if args.curriculum_learning_legacy and args.curriculum_seqlen < 
args.seq_length: + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous() + # No need to truncate labels as we do not need it for the teacher logits + tea_output, tea_other_losses = teacher_model(tokens, position_ids, attention_mask) + assert stu_output.size() == tea_output.size(), 'teacher and student output should match in size. Student: {}, Teacher: {}, CL seq length {}'.format(stu_output.size(), tea_output.size(), args.curriculum_seqlen) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + tea_logits = F.softmax(tea_output / kd_temp, dim=2) # The target logits is expected to be probabilities. If we use log_softmax, then we need to set target_log to true when initializing the KLDivLoss. + + mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')(student_logits, tea_logits) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \ + args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape': + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + + if args.mos or args.kd: + # The forward func can return either the loss or the logits, depending on whether passing in the labels or not. 
+ stu_output, other_losses = model(tokens, position_ids, attention_mask) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + assert args.curriculum_seqlen is not None + labels = labels[:, :args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels) + else: + output_tensor, other_losses = model(tokens, position_ids, attention_mask, + labels=labels) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss(args, stu_output, + args.teacher_model[0], tokens, position_ids, attention_mask) + + # Output_tensor stores the standard loss, loos_func calculates the total loss. 
+ return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + + +def git_ds_info(): + if RANK != 0: + return + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process) From 
f51f1fa15126ee0e7599b265ce3de274cf17e5b6 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:31:56 -0600 Subject: [PATCH 040/268] Rename `pretrain_gpt.py` -> `pretrain_gpt_alcf.py` in `ALCF/*.sh` --- ALCF/README.md | 5 ++++- ALCF/launch.sh | 3 ++- ALCF/submit.sh | 2 +- ALCF/train-gpt3.sh | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 89fb639da8..e8f4127876 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -57,7 +57,7 @@ strategies and various optimizations that are supported. # - other args: defined in ALCF/args.sh # --------------------------------------------- MODEL_SIZE_KEY="GPT25B" \ - SEQ_LEN=4096 \ + SEQ_LEN=4096 \ USE_FLASH_ATTN_V2=1 \ MICRO_BATCH=1 \ GAS=1 \ @@ -69,6 +69,7 @@ strategies and various optimizations that are supported. ## Helper Scripts +- [`pretrain_gpt_alcf.py`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/pretrain_gpt_alcf.py) - 📂 [`ALCF/`](https://github.com/argonne-lcf/Megatron-DeepSpeed/tree/main/ALCF) `├──` [`args.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/models.sh) `├──` [`launch.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/launch.sh) @@ -80,6 +81,8 @@ strategies and various optimizations that are supported.
+
pretrain_gpt_alcf.py +
Python module to be launched. Running `./ALCF/train-gpt3.sh` will automatically build an `mpiexec` command and launch this module.
ALCF/train-gpt3.sh
Main entry point for training. This script will automatically source the rest of the required ALCF/*.sh scripts below
ALCF/model.sh
diff --git a/ALCF/launch.sh b/ALCF/launch.sh index 41a620a145..2dd834f568 100755 --- a/ALCF/launch.sh +++ b/ALCF/launch.sh @@ -39,7 +39,8 @@ MPI_WRAPPER="${SCRIPT_DIR}/mpi_wrapper" # sourceFile "${ALCF_DIR}/args.sh" -MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" +# MAIN="${PARENT}/pretrain_${MODEL_TYPE}.py" +MAIN="${PARENT}/pretrain_gpt_alcf.py" printJobInfo() { echo "Job started at: ${TSTAMP} on $(hostname)" diff --git a/ALCF/submit.sh b/ALCF/submit.sh index cc308b5be5..6842eb66b6 100755 --- a/ALCF/submit.sh +++ b/ALCF/submit.sh @@ -22,7 +22,7 @@ echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" #┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ #┃ Make sure we're not already running; if so, exit here ┃ #┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -PIDS=$(ps aux | grep pretrain_gpt.py | grep -v grep | awk '{print $2}') +PIDS=$(ps aux | grep pretrain_gpt_alcf.py | grep -v grep | awk '{print $2}') if [ -n "${PIDS}" ]; then echo "Already running! Exiting!" exit 1 diff --git a/ALCF/train-gpt3.sh b/ALCF/train-gpt3.sh index bb054b0386..79e2661d16 100755 --- a/ALCF/train-gpt3.sh +++ b/ALCF/train-gpt3.sh @@ -31,7 +31,7 @@ echo "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+" #┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ #┃ Make sure we're not already running; if so, exit here ┃ #┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ -PIDS=$(ps aux | egrep "$USER.+mpi.+pretrain_gpt.py" | grep -v grep | awk '{print $2}') +PIDS=$(ps aux | grep -E "$USER.+mpi.+pretrain_gpt_alcf.py" | grep -v grep | awk '{print $2}') if [ -n "${PIDS}" ]; then echo "Already running! Exiting!" 
exit 1 From 7c8029af74401d17b964e11c34644c443bb2aeed Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:32:41 -0600 Subject: [PATCH 041/268] Update `set_params.sh` --- set_params.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/set_params.sh b/set_params.sh index 40682a9fe3..8d6cd1ce91 100644 --- a/set_params.sh +++ b/set_params.sh @@ -21,10 +21,10 @@ export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP TP=${TP:-1} PP=${PP:-1} -export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" -export DATA_TYPE="BookCorpusDataset_text_document" -# export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" -# export DATA_TYPE="genslm_subsample_200k_sequence_document" +# export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" +# export DATA_TYPE="BookCorpusDataset_text_document" +export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" +export DATA_TYPE="genslm_subsample_200k_sequence_document" export DATA_DIR="${DATA_PARENT}/dataset" export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" @@ -41,7 +41,7 @@ echo "!!!Please see logs at ${OUTPUT_DIR}" # Hostfile path hostfile_deepspeed=./hostfile_deepspeed hostfile_mpich=./hostfile_mpich -cat $PBS_NODEFILE > hostfile_mpich +cat $PBS_NODEFILE > hostfile_mpich cat $PBS_NODEFILE > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed ds_args=" " From 1152aad4ceafed167cddafc4df7fd2e2f7fc4aca Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 1 Feb 2024 13:40:07 -0600 Subject: [PATCH 042/268] Update `set_params.sh` --- set_params.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/set_params.sh b/set_params.sh index 8d6cd1ce91..b6bd36038d 100644 --- a/set_params.sh +++ b/set_params.sh @@ -68,7 +68,7 @@ fi NCCL=${NCCL:-nccl} run_cmd=" - deepspeed $launcher pretrain_gpt.py \ + 
deepspeed $launcher pretrain_gpt_alcf.py \ --tensor-model-parallel-size $TP \ --pipeline-model-parallel-size $PP \ --num-layers $NLAYERS \ From 91cf7b1b8746e564b072cbf975301d172c8a6291 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 2 Feb 2024 10:20:30 -0600 Subject: [PATCH 043/268] added pretrain_gpt_alcf.py --- llama.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.sh b/llama.sh index 323a5f4730..b62fe794ef 100755 --- a/llama.sh +++ b/llama.sh @@ -1,4 +1,4 @@ -#!/bin/bashOA +#!/bin/bash #PBS -l walltime=0:30:00 #PBS -A datascience #PBS -q debug-scaling From e1f9e300531cf747454f6a26af6539818fd73f12 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 2 Feb 2024 10:20:46 -0600 Subject: [PATCH 044/268] added pretrain_gpt_alcf.py --- pretrain_gpt_alcf.py | 593 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 593 insertions(+) create mode 100644 pretrain_gpt_alcf.py diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py new file mode 100644 index 0000000000..0c82e0ff0e --- /dev/null +++ b/pretrain_gpt_alcf.py @@ -0,0 +1,593 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Pretrain GPT""" + +import os +import torch +import math +# import logging + +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from rich import print +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import mpu, tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb +from megatron.arguments import core_transformer_config_from_args +from megatron.utils import ( + report_memory, + throughput_calculator, + checkpoint_throughput_calculator +) +from pathlib import Path + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator +import subprocess +import wandb + +import time +from torch import nn +import torch.nn.functional as F + +# from ezpz import get_logger +from ezpz.dist import setup_torch, get_world_size, setup_wandb + +RANK = setup_torch( + backend='deepspeed', + port='5432', +) +WORLD_SIZE = get_world_size() +LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" + +WANDB_MODE = os.environ.get('WANDB_MODE', None) +DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' + +if RANK == 0 and not DISABLE_WANDB: + project_name = ( + os.environ.get( + 'WB_PROJECT', + os.environ.get( + 'WANDB_PROJECT', + 'GenSLM-Megatron-DS' + ), + ) + ) + print('--------------------------------------------------') + print(f"Setting up W&B from: {RANK} with {project_name}") + print('--------------------------------------------------') + setup_wandb(project_name=project_name) + + +# os.environ[''] +# wblogger = logging.getLogger("wandb") +# wblogger.setLevel(logging.DEBUG) + +# log = 
get_logger(__name__, level=LEVEL) +# +# log.critical(f"Hello from rank: {RANK} / {WORLD_SIZE} !") + +import socket +from typing import Optional +# log.critical(f"Setting up W&B from rank: {RANK} with {wb_project_name}") + + +# def setup_wandb(project_name: Optional[str] = None): +# print(f"Setting up W&B from: {RANK}") +# project_name = ( +# os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') +# if project_name is None else project_name +# ) +# print(f"Setting up wandb from rank: {RANK}") +# print(f"Using: WB PROJECT: {project_name}") +# # if get_rank() == 0: +# # tensorboard_dir = args.tensorboard_dir +# tensorboard_dir = None +# # if config is None: +# tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) +# # else: +# # tensorboard_dir = ( +# # config.get( +# # 'tensorboard_dir', +# # None, # os.getcwd() +# # ) +# # ) +# if tensorboard_dir is not None: +# print(f'Patching tensorboard from {tensorboard_dir}') +# wandb.tensorboard.patch(root_logdir=tensorboard_dir) +# # wbrun_id = wandb.util.generate_id() +# current_time = time.time() +# # local_time = time.localtime(current_time) +# # if wandb.run is None: +# wandb.init( +# resume='allow', +# sync_tensorboard=(tensorboard_dir is not None), # True, +# project=(project_name if project_name is not None else None), +# # dir=(tensorboard_dir if tensorboard_dir is not None else None), +# ) +# assert wandb.run is not None +# print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") +# wandb.run.config.update({'current_time': current_time}) +# model_size = os.environ.get('MODEL_SIZE', None) +# wandb.run.config.update({'world_size': get_world_size()}) +# # if config is not None: +# # wandb.run.config.update(config) +# env = { +# k: v for k, v in dict(os.environ).items() +# if not k.startswith('_ModuleTable') +# } +# _ = env.pop('LS_COLORS', None) +# _ = env.pop('PS1', None) +# wandb.run.config.update({'env': env}) +# hostname = socket.gethostbyaddr(socket.gethostname())[0] +# if hostname.startswith('theta'): +# 
wandb.run.config.update({'machine': 'ThetaGPU'}) +# elif hostname.startswith('x3'): +# wandb.run.config.update({'machine': 'Polaris'}) +# elif hostname.startswith('x1'): +# wandb.run.config.update({'machine': 'Sunspot'}) +# elif hostname.startswith('nid'): +# wandb.run.config.update({'machine': 'Perlmutter'}) +# elif hostname.startswith('login'): +# wandb.run.config.update({'machine': 'NERSC'}) +# else: +# wandb.run.config.update({'machine': hostname}) +# if model_size is not None: +# wandb.run.config.update({'MODEL_SIZE': model_size}) + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + print_rank_0('building GPT model ...') + see_memory_usage("Before Building Model", force=True) + args = get_args() + config = core_transformer_config_from_args(args) + # args = get_args() + # timers = get_timers() + if wandb.run is not None: + print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + wandb.run.config.update({"args": vars(args)}) + if RANK == 0: + git_ds_info() + + with deepspeed.zero.Init(sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config_dict, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and not args.no_pipeline_parallel: + model = GPTModelPipe( + config=config, + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + # Predompute the attention mask and store it in args. This avoids having to + # pipeline it as an activation during training. The mask is constant, and thus + # we can reuse it. 
+ attention_mask = torch.tril(torch.ones( + (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view( + 1, 1, args.seq_length, args.seq_length) + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + if args.fp16: + attention_mask = attention_mask.half() + elif args.bf16: + attention_mask = attention_mask.bfloat16() + + # Attention mask must be bool. + args.attn_mask = attention_mask.to(torch.bool) + + # For prertaining, since sequence length is fixed, cache rotary embedding in args, to avoid communicating around + if args.use_rotary_position_embeddings: + update_rotary_pos_emb(args.seq_length) + + else: + model = GPTModel( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + # print_rank_0('\n ------------------------ ') + # print_rank_0(f'num of parameters {num_params}') + # print_rank_0('------------------------\n ') + print_rank_0(80 * '-') + print_rank_0(f"Number of parameters in model: {num_params}") + print_rank_0(80 * '-') + see_memory_usage("After Building Model", force=True) + if wandb.run is not None: + wandb.run.watch( + model, + log='all', + log_graph=True, + ) + wandb.run.config.update({'num_params': num_params}) + return model + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ skip_mask = hasattr(args, 'use_flash_attn') or hasattr(args, 'flash_attn_triton') + # skip_mask = args.use_flash_attn or args.use_flash_attn_triton + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + skip_mask) + + # For DS's sequence parallel + seq_parallel_world_size = mpu.get_sequence_parallel_world_size() + seq_parallel_world_rank = mpu.get_sequence_parallel_rank() + + # For Megatron's sequence parallel + if args.sequence_parallel: + seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size() + seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank() + seq_length = tokens.size(1) + + assert seq_length % seq_parallel_world_size == 0 + sub_seq_length = seq_length // seq_parallel_world_size + sub_seq_start = seq_parallel_world_rank * sub_seq_length + sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length + + tokens = tokens[:, sub_seq_start:sub_seq_end] + position_ids = position_ids[:, sub_seq_start:sub_seq_end] + # For DS's sequence parallel + if mpu.get_sequence_parallel_world_size() > 1: + labels = labels[:, sub_seq_start:sub_seq_end] + + return tokens, labels, loss_mask, attention_mask, position_ids + + +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if current_seqlen < args.seq_length: + data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() + elif 'seqlen_reshape' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + if current_seqlen 
< args.seq_length: + orig_num_token = torch.numel(data['text']) + reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) + data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), + data['text'][:, -(current_seqlen+1):]), 0).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen+1)) + num_row = min(num_row, data['text'].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data['text'] = data['text'][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data + + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] + tokens = tokens[:, :args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + if labels is not None: + labels = labels[:, :args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): + args = get_args() + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + if args.mos or args.kd: + # assert max(args.num_experts) >= 1 + loss = loss + moe_loss + mos_loss + if args.mos: + return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'mos loss': mos_loss} + elif args.kd: + return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'kd loss': mos_loss} + print_rank_0('>>> total loss: {}, lm loss {}, kd loss {}'.format(loss, averaged_loss[0], mos_loss)) + else: + if max(args.num_experts) <= 1: + return loss, {'lm loss': averaged_loss[0]} + else: + loss = loss + moe_loss + return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} + +def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask): + mos_loss = 0 + alpha = args.kd_alpha_ce + beta = args.kd_beta_ce + kd_temp = args.kd_temp + + if teacher_model: + with torch.no_grad(): + if args.curriculum_learning_legacy and args.curriculum_seqlen < 
args.seq_length: + assert args.curriculum_seqlen is not None + curriculum_seqlen = args.curriculum_seqlen + tokens = tokens[:, :curriculum_seqlen].contiguous() + position_ids = position_ids[:, :curriculum_seqlen].contiguous() + attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous() + # No need to truncate labels as we do not need it for the teacher logits + tea_output, tea_other_losses = teacher_model(tokens, position_ids, attention_mask) + assert stu_output.size() == tea_output.size(), 'teacher and student output should match in size. Student: {}, Teacher: {}, CL seq length {}'.format(stu_output.size(), tea_output.size(), args.curriculum_seqlen) + + student_logits = F.log_softmax(stu_output / kd_temp, dim=2) + tea_logits = F.softmax(tea_output / kd_temp, dim=2) # The target logits is expected to be probabilities. If we use log_softmax, then we need to set target_log to true when initializing the KLDivLoss. + + mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')(student_logits, tea_logits) + + mos_loss = mos_loss.div(args.seq_length) * beta + return mos_loss + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \ + args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape': + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + + if args.mos or args.kd: + # The forward func can return either the loss or the logits, depending on whether passing in the labels or not. 
+ stu_output, other_losses = model(tokens, position_ids, attention_mask) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + assert args.curriculum_seqlen is not None + labels = labels[:, :args.curriculum_seqlen].contiguous() + output_tensor = tensor_parallel.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels) + else: + output_tensor, other_losses = model(tokens, position_ids, attention_mask, + labels=labels) + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + moe_losses = [] + for moe_loss in other_losses: + if moe_loss is not None: + moe_losses.append(moe_loss) + moe_loss = sum(moe_losses) * args.moe_loss_coeff + + mos_loss = 0 + if args.mos or args.kd: + assert model.training + if args.teacher_forward and args.teacher_model is not None: + mos_loss = calculate_mos_loss(args, stu_output, + args.teacher_model[0], tokens, position_ids, attention_mask) + + # Output_tensor stores the standard loss, loos_func calculates the total loss. 
+ return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + files = [] + if args.data_file_list is not None: + with open(args.data_file_list, 'r') as flist: + for f in flist.readlines(): + w, fname = f.split() + files.append(float(w)) + files.append(fname) + elif len(args.data_path)==1 and os.path.isdir(args.data_path[0]): + path=args.data_path[0] + "/" + for f in os.listdir(path): + if (os.path.isfile(path + f) and f.find(".bin")!=-1): + files.append(1) + files.append(path + f.split(".bin")[0]) + else: + files = args.data_path + print_rank_0(f"file list {files}") + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=files, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except 
subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +def main(): + # if RANK == 0: + # setup_wandb() + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + args = get_args() + prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + + # # from megatron.training import get_model + # if wandb.run is not None: + # args = get_args() + # timers = get_timers() + # # model = get_model(model_provider, ModelType.encoder_or_decoder) + # elapsed_time = timers('interval-time').elapsed(barrier=True) + # total_iterations = os.environ.get( + # "TOTAL_ITERATIONS", + # (args.train_iters + args.eval_iters) + # ) + # seq_len = args.seq_length + # elapsed_time_per_iteration = elapsed_time / total_iterations + # if model is not None: + # samples_per_sec, tflops, approx_params_in_billions = throughput_calculator( + # model, + # args, + # elapsed_time, + # total_iterations, + # ) + # # Compute throughput. + # samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size + # tokens_per_sec = samples_per_sec * seq_len + # tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size + # sample_consumption_rate = args.consumed_train_samples / elapsed_time + # token_consumption_rate = args.consumed_train_tokens / elapsed_time + # # Tensorboard values. 
+ # tdata = { + # # 'iteration': iteration, + # 'consumed_train_samples': args.consumed_train_samples, + # 'consumed_train_tokens': args.consumed_train_tokens, + # # 'learning_rate': learning_rate, + # # 'batch_size': batch_size, + # # 'loss_scale': loss_scale, + # # 'grad_norm': grad_norm, + # } + # # for key in loss_dict: + # # tdata[f'lm-loss/{key}'] = loss_dict[key] + # + # tdata = {f'train/{k}': v for k, v in tdata.items()} + # # if wbrun is not None and wbrun is wandb.run: + # if wandb.run is not None: + # wandb.run.log(tdata, commit=False) + # tput = { + # 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s + # 'throughput/samples_per_sec': samples_per_sec, + # 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, + # 'throughput/tokens_per_sec': tokens_per_sec, + # 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, + # 'throughput/tflops': tflops, + # 'throughput/approx_params_in_billions': approx_params_in_billions, + # 'throughput/sample_consumption_rate': sample_consumption_rate, + # 'throughput/token_consumption_rate': token_consumption_rate, + # 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, + # } + # wandb.run.log(tput) + return model + + + +if __name__ == "__main__": + # git_ds_info() + # pretrain(train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process) + import sys + import deepspeed.comm as dist + model = main() + dist.log_summary() + if wandb.run is not None: + print(f"wandb.run.name: {wandb.run.name}") + print(f"wandb.run.url: {wandb.run.url}") + wandb.finish() + sys.exit() From c861a0b67593460a88eadd2f5d8184d287daa718 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 2 Feb 2024 10:40:21 -0600 Subject: [PATCH 045/268] llama alcf --- llama.sh => llama_alcf.sh | 0 pretrain_gpt.py | 207 +------------------------------------- 
2 files changed, 2 insertions(+), 205 deletions(-) rename llama.sh => llama_alcf.sh (100%) diff --git a/llama.sh b/llama_alcf.sh similarity index 100% rename from llama.sh rename to llama_alcf.sh diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 6922fb80c9..785a129156 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -27,113 +27,6 @@ from torch import nn import torch.nn.functional as F -<<<<<<< HEAD -# from ezpz import get_logger -from ezpz.dist import setup_torch, get_world_size, setup_wandb - -RANK = setup_torch( - backend='deepspeed', - port='5432', -) -WORLD_SIZE = get_world_size() -LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" - -WANDB_MODE = os.environ.get('WANDB_MODE', None) -DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' - -if RANK == 0 and not DISABLE_WANDB: - project_name = ( - os.environ.get( - 'WB_PROJECT', - os.environ.get( - 'WANDB_PROJECT', - 'GenSLM-Megatron-DS' - ), - ) - ) - print('--------------------------------------------------') - print(f"Setting up W&B from: {RANK} with {project_name}") - print('--------------------------------------------------') - setup_wandb(project_name=project_name) - - -# os.environ[''] -# wblogger = logging.getLogger("wandb") -# wblogger.setLevel(logging.DEBUG) - -# log = get_logger(__name__, level=LEVEL) -# -# log.critical(f"Hello from rank: {RANK} / {WORLD_SIZE} !") - -import socket -from typing import Optional -# log.critical(f"Setting up W&B from rank: {RANK} with {wb_project_name}") - - -# def setup_wandb(project_name: Optional[str] = None): -# print(f"Setting up W&B from: {RANK}") -# project_name = ( -# os.environ.get('WB_PROJECT', 'GenSLM-Megatron-DS') -# if project_name is None else project_name -# ) -# print(f"Setting up wandb from rank: {RANK}") -# print(f"Using: WB PROJECT: {project_name}") -# # if get_rank() == 0: -# # tensorboard_dir = args.tensorboard_dir -# tensorboard_dir = None -# # if config is None: -# tensorboard_dir = os.environ.get('TENSORBOARD_DIR', None) 
-# # else: -# # tensorboard_dir = ( -# # config.get( -# # 'tensorboard_dir', -# # None, # os.getcwd() -# # ) -# # ) -# if tensorboard_dir is not None: -# print(f'Patching tensorboard from {tensorboard_dir}') -# wandb.tensorboard.patch(root_logdir=tensorboard_dir) -# # wbrun_id = wandb.util.generate_id() -# current_time = time.time() -# # local_time = time.localtime(current_time) -# # if wandb.run is None: -# wandb.init( -# resume='allow', -# sync_tensorboard=(tensorboard_dir is not None), # True, -# project=(project_name if project_name is not None else None), -# # dir=(tensorboard_dir if tensorboard_dir is not None else None), -# ) -# assert wandb.run is not None -# print(f"W&B RUN: [{wandb.run.name}]({wandb.run.url})") -# wandb.run.config.update({'current_time': current_time}) -# model_size = os.environ.get('MODEL_SIZE', None) -# wandb.run.config.update({'world_size': get_world_size()}) -# # if config is not None: -# # wandb.run.config.update(config) -# env = { -# k: v for k, v in dict(os.environ).items() -# if not k.startswith('_ModuleTable') -# } -# _ = env.pop('LS_COLORS', None) -# _ = env.pop('PS1', None) -# wandb.run.config.update({'env': env}) -# hostname = socket.gethostbyaddr(socket.gethostname())[0] -# if hostname.startswith('theta'): -# wandb.run.config.update({'machine': 'ThetaGPU'}) -# elif hostname.startswith('x3'): -# wandb.run.config.update({'machine': 'Polaris'}) -# elif hostname.startswith('x1'): -# wandb.run.config.update({'machine': 'Sunspot'}) -# elif hostname.startswith('nid'): -# wandb.run.config.update({'machine': 'Perlmutter'}) -# elif hostname.startswith('login'): -# wandb.run.config.update({'machine': 'NERSC'}) -# else: -# wandb.run.config.update({'machine': hostname}) -# if model_size is not None: -# wandb.run.config.update({'MODEL_SIZE': model_size}) -======= ->>>>>>> 9c3a73dfebb812e7a494eaaa0a0dc1138dd0f922 def model_provider(pre_process=True, post_process=True): """Build the model.""" @@ -196,6 +89,7 @@ def 
model_provider(pre_process=True, post_process=True): see_memory_usage(f"After Building Model", force=True) return model + def get_batch(data_iterator): """Generate a batch""" args = get_args() @@ -250,7 +144,6 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids - def data_post_process(data, data_sampler_state_dict): args = get_args() if args.data_efficiency_curriculum_learning: @@ -276,7 +169,6 @@ def data_post_process(data, data_sampler_state_dict): args.data_efficiency_curriculum_learning_seqlen_type = None return data - def get_batch_pipe(data): """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" args = get_args() @@ -362,7 +254,6 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at mos_loss = mos_loss.div(args.seq_length) * beta return mos_loss - def forward_step(data_iterator, model): """Forward step.""" args = get_args() @@ -416,24 +307,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): print_rank_0('> building train, validation, and test datasets ' 'for GPT ...') - files = [] - if args.data_file_list is not None: - with open(args.data_file_list, 'r') as flist: - for f in flist.readlines(): - w, fname = f.split() - files.append(float(w)) - files.append(fname) - elif len(args.data_path)==1 and os.path.isdir(args.data_path[0]): - path=args.data_path[0] + "/" - for f in os.listdir(path): - if (os.path.isfile(path + f) and f.find(".bin")!=-1): - files.append(1) - files.append(path + f.split(".bin")[0]) - else: - files = args.data_path - print_rank_0(f"file list {files}") train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=files, + data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, @@ -476,84 +351,6 @@ def git_ds_info(): print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') -<<<<<<< 
HEAD -def main(): - # if RANK == 0: - # setup_wandb() - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) - args = get_args() - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") - - # # from megatron.training import get_model - # if wandb.run is not None: - # args = get_args() - # timers = get_timers() - # # model = get_model(model_provider, ModelType.encoder_or_decoder) - # elapsed_time = timers('interval-time').elapsed(barrier=True) - # total_iterations = os.environ.get( - # "TOTAL_ITERATIONS", - # (args.train_iters + args.eval_iters) - # ) - # seq_len = args.seq_length - # elapsed_time_per_iteration = elapsed_time / total_iterations - # if model is not None: - # samples_per_sec, tflops, approx_params_in_billions = throughput_calculator( - # model, - # args, - # elapsed_time, - # total_iterations, - # ) - # # Compute throughput. - # samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size - # tokens_per_sec = samples_per_sec * seq_len - # tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size - # sample_consumption_rate = args.consumed_train_samples / elapsed_time - # token_consumption_rate = args.consumed_train_tokens / elapsed_time - # # Tensorboard values. 
- # tdata = { - # # 'iteration': iteration, - # 'consumed_train_samples': args.consumed_train_samples, - # 'consumed_train_tokens': args.consumed_train_tokens, - # # 'learning_rate': learning_rate, - # # 'batch_size': batch_size, - # # 'loss_scale': loss_scale, - # # 'grad_norm': grad_norm, - # } - # # for key in loss_dict: - # # tdata[f'lm-loss/{key}'] = loss_dict[key] - # - # tdata = {f'train/{k}': v for k, v in tdata.items()} - # # if wbrun is not None and wbrun is wandb.run: - # if wandb.run is not None: - # wandb.run.log(tdata, commit=False) - # tput = { - # 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - # 'throughput/samples_per_sec': samples_per_sec, - # 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - # 'throughput/tokens_per_sec': tokens_per_sec, - # 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - # 'throughput/tflops': tflops, - # 'throughput/approx_params_in_billions': approx_params_in_billions, - # 'throughput/sample_consumption_rate': sample_consumption_rate, - # 'throughput/token_consumption_rate': token_consumption_rate, - # 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, - # } - # wandb.run.log(tput) - return model - - - -======= ->>>>>>> 9c3a73dfebb812e7a494eaaa0a0dc1138dd0f922 if __name__ == "__main__": git_ds_info() pretrain(train_valid_test_datasets_provider, From bb2bdc6c294165f8cc3cfc113043a0c1c56b5fbd Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 5 Feb 2024 11:12:18 -0600 Subject: [PATCH 046/268] update submission script --- llama_alcf.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_alcf.sh b/llama_alcf.sh index b62fe794ef..17e4afd713 100755 --- a/llama_alcf.sh +++ b/llama_alcf.sh @@ -15,10 +15,22 @@ export TP=1 export PP=1 export MBS=1 export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) +export SP=$((PBS_JOBSIZE*PPN/PP/TP)) #export 
DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/" + +export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_small.txt" echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ @@ -31,8 +43,8 @@ MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --seq-length 2048 \ --max-position-embeddings 2048 \ --train-iters 10 \ - --save ${MD}/checkpoints/LLAMA_7B_LLAMA_7B_z2_seqlen_mp1_pp1_sp24_nl32_hs4096_gb${BS}_mb1 \ - --load ${MD}/checkpoints/LLAMA_7B_LLAMA_7B_z2_seqlen_mp1_pp1_sp24_nl32_hs4096_gb${BS}_mb1 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ --data-impl mmap \ --tokenizer-type Llama2Tokenizer \ --split 949,50,1 \ @@ -57,7 +69,8 @@ MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --use-rotary-position-embeddings \ --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ --untie-embeddings-and-output-weights \ - --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 --tensorboard-dir ${MD}/outputs/LLAMA_7B_LLAMA_7B_z3_seqlen_mp1_pp1_sp24_nl32_hs4096_gb24_mb1/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ${MD}/outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard 
--tensorboard-log-interval 1 \ --data-file-list ${DATA_FILE_LIST} \ --data-path ${DATA_PATH} \ --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ From 5dd69bf8ed169525aaf034b77167acd7e19fe99d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 5 Feb 2024 13:11:50 -0600 Subject: [PATCH 047/268] fixed zero_stage input in llama_alcf.sh --- llama_alcf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_alcf.sh b/llama_alcf.sh index 17e4afd713..a102c492fb 100755 --- a/llama_alcf.sh +++ b/llama_alcf.sh @@ -31,7 +31,7 @@ TRAIN_ITERS=10 ZERO_STAGE=2 MODEL=LLAMA_7B OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt.py \ +MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers 32 \ @@ -74,4 +74,4 @@ MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --data-file-list ${DATA_FILE_LIST} \ --data-path ${DATA_PATH} \ --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ - --zero-stage=2 --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed From 82555c8ed745ba8f92c4fa1515bd34ccdd743266 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 5 Feb 2024 16:45:58 -0600 Subject: [PATCH 048/268] Update `set_params.sh` --- set_params.sh | 54 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/set_params.sh b/set_params.sh index b6bd36038d..696c0eb4aa 100644 --- a/set_params.sh +++ b/set_params.sh @@ -13,6 +13,7 @@ export TP=${TP:-1} export 
PP=${PP:-1} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) +export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} # bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ @@ -21,20 +22,25 @@ export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP TP=${TP:-1} PP=${PP:-1} -# export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" +export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" # export DATA_TYPE="BookCorpusDataset_text_document" -export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" -export DATA_TYPE="genslm_subsample_200k_sequence_document" +# export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" +# export DATA_TYPE="genslm_subsample_200k_sequence_document" export DATA_DIR="${DATA_PARENT}/dataset" -export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" +# export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" +export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" +export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_small.txt" + DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" bash ./generate_config.sh ${DS_CONFIG} || exit 1 -OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} +OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" +# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} +OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" mkdir -p 
$OUTPUT_DIR echo "!!!Please see logs at ${OUTPUT_DIR}" @@ -51,8 +57,27 @@ if [ $PP == 1 ]; then fi ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + # --checkpoint-activations \ + # --deepspeed-activation-checkpointing +fi + +gpt_args=() + +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers 1" + ) +fi # we are now using activation checkpoint provided by megatron, see below. # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" +# NUM_KV_HEADS="${NUM_KV_HEADS:-0}" +# if [[ $NUM_KV_HEADS -]] # take custom args custom_args=" $@" @@ -67,6 +92,9 @@ fi NCCL=${NCCL:-nccl} +# MODEL=LLAMA_7B +# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} + run_cmd=" deepspeed $launcher pretrain_gpt_alcf.py \ --tensor-model-parallel-size $TP \ @@ -86,14 +114,9 @@ run_cmd=" --log-interval 1 \ --eval-iters 100 \ --eval-interval 100 \ - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --save-interval 500 \ + --save-interval 50 \ --split 100,0,0 \ --$DTYPE \ - --checkpoint-activations \ - --deepspeed-activation-checkpointing $ds_args \ --no-masked-softmax-fusion \ --no-bias-gelu-fusion \ @@ -101,6 +124,15 @@ run_cmd=" --no-gradient-accumulation-fusion \ --distributed-backend $NCCL \ --num-workers 0 \ + --tokenizer-type Llama2Tokenizer \ + --save checkpoints/${OUTPUT_PREFIX} \ + --load checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path $DATA_PATH \ + --vocab-file $VOCAB_FILE \ + 
--merge-file $MERGE_FILE \ + ${gpt_args[*]} \ $custom_args \ |& tee $OUTPUT_DIR/output.log " From 9dc348290affa5b5f0c86e2484e42aabe824ef7e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 5 Feb 2024 16:47:21 -0600 Subject: [PATCH 049/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 6937bbdf2c..9737477a9e 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -37,17 +37,20 @@ import torch.nn.functional as F # from ezpz import get_logger -from ezpz.dist import setup_torch, get_world_size, setup_wandb +from ezpz.dist import get_world_size, setup_wandb, get_rank -RANK = setup_torch( - backend='deepspeed', - port='5432', -) +# RANK = setup_torch( +# backend='deepspeed', +# port='5432', +# ) +RANK = get_rank() WORLD_SIZE = get_world_size() LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" WANDB_MODE = os.environ.get('WANDB_MODE', None) -DISABLE_WANDB = WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' +DISABLE_WANDB = ( + WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' +) if RANK == 0 and not DISABLE_WANDB: project_name = ( @@ -64,6 +67,7 @@ print('--------------------------------------------------') setup_wandb(project_name=project_name) + def model_provider(pre_process=True, post_process=True): """Build the model.""" @@ -83,6 +87,10 @@ def model_provider(pre_process=True, post_process=True): dpg = mpu.get_data_parallel_group() else: dpg = None + if wandb is not None and wandb.run is not None: + assert wandb is not None and wandb.run is not None + print(f'Updating {wandb.run.name=} at {wandb.run.url=}') + wandb.run.config.update({'args': vars(args)}) with deepspeed.zero.Init(data_parallel_group=dpg, remote_device=None if args.remote_device == 'none' else args.remote_device, config_dict_or_path=args.deepspeed_config_dict, @@ -427,18 +435,17 @@ def main(): # if 
RANK == 0: # setup_wandb() from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) - args = get_args() - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") - + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + # args = get_args() + # prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") # # from megatron.training import get_model # if wandb.run is not None: # args = get_args() From 41208b4b99e40f6e930ed76c76d7cfd1254fe4a9 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 6 Feb 2024 00:35:02 -0600 Subject: [PATCH 050/268] Update `.gitignore` --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index fc81539941..610af67ac3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +outputs/ +venvs/ +wandb/ +llama-logs/ +checkpoints/ *.gz *.txt *.idx From 42f5eb93265922bde57d02d33f61ac55311b74c1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 7 Feb 2024 11:22:24 -0600 Subject: [PATCH 051/268] Update `set_params.sh` --- set_params.sh | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/set_params.sh b/set_params.sh index 696c0eb4aa..41ef9b4908 100644 --- a/set_params.sh +++ b/set_params.sh @@ -1,12 +1,16 @@ #!/bin/bash login # echo "!!!please use generate_hostfile.sh to set hostfile for 18 nodes before training" -export 
WORLD_SIZE=${WORLD_SIZE:-216} +export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${PBS_NODEFILE}")} export MICRO_BATCH=${MICRO_BATCH:-1} export NLAYERS=${NLAYERS:-96} export HIDDEN=${HIDDEN:-12288} export HEADS=${HEADS:-96} -export SEQ=${SEQ:-2048} +export LR=${LR:-0.0003} +export SEQ=${SEQ:-4096} export TRAIN_ITER=${TRAIN_ITER:-20} +export EVAL_ITERS=${EVAL_ITERS:-100} +export SAVE_INTERVAL=${SAVE_INTERVAL:-50} +export EVAL_INTERVAL=${EVAL_INTERVAL:-50} export ZERO_STAGE=${ZERO_STAGE:-3} export DTYPE=${DTYPE:-fp16} export TP=${TP:-1} @@ -22,17 +26,20 @@ export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} TP=${TP:-1} PP=${PP:-1} -export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" +# export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" # export DATA_TYPE="BookCorpusDataset_text_document" # export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" # export DATA_TYPE="genslm_subsample_200k_sequence_document" -export DATA_DIR="${DATA_PARENT}/dataset" +# export DATA_DIR="${DATA_PARENT}/dataset" # export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" -export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" -export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" +# export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" +# export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" -export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_small.txt" +export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" +# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select.txt" +export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_only_rust.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_modified.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_small.txt" DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" @@ 
-41,7 +48,7 @@ bash ./generate_config.sh ${DS_CONFIG} || exit 1 OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" -mkdir -p $OUTPUT_DIR +mkdir -p "${OUTPUT_DIR}" echo "!!!Please see logs at ${OUTPUT_DIR}" # Hostfile path @@ -95,6 +102,9 @@ NCCL=${NCCL:-nccl} # MODEL=LLAMA_7B # OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +# --vocab-file $VOCAB_FILE \ +# --merge-file $MERGE_FILE \ +# --lr-decay-iters 320000 \ run_cmd=" deepspeed $launcher pretrain_gpt_alcf.py \ --tensor-model-parallel-size $TP \ @@ -107,15 +117,16 @@ run_cmd=" --micro-batch-size $MICRO_BATCH \ --global-batch-size $GLOBAL_BATCH \ --train-iters $TRAIN_ITER \ - --lr 0.00015 \ + --lr ${LR} \ + --min-lr 1.0e-5 \ --lr-warmup-fraction .01 \ --lr-decay-iters 320000 \ --lr-decay-style cosine \ --log-interval 1 \ - --eval-iters 100 \ - --eval-interval 100 \ - --save-interval 50 \ - --split 100,0,0 \ + --eval-iters ${EVAL_ITERS} \ + --eval-interval ${EVAL_INTERVAL} \ + --save-interval ${SAVE_INTERVAL} \ + --split 90,5,5 \ --$DTYPE \ $ds_args \ --no-masked-softmax-fusion \ @@ -127,16 +138,18 @@ run_cmd=" --tokenizer-type Llama2Tokenizer \ --save checkpoints/${OUTPUT_PREFIX} \ --load checkpoints/${OUTPUT_PREFIX} \ + --use-checkpoint-opt_param-scheduler \ --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ --data-file-list ${DATA_FILE_LIST} \ --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ ${gpt_args[*]} \ $custom_args \ |& tee $OUTPUT_DIR/output.log " +echo "Using $(which deepspeed)" +ds_report + echo ${run_cmd} eval ${run_cmd} set +x From 8b6d8bfef60932de31f9b02eccb0d18709e6e46e Mon 
Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 8 Feb 2024 15:44:38 -0600 Subject: [PATCH 052/268] adding master_addr in case it doesn --- llama_alcf.sh | 11 +++++------ pretrain_gpt_alcf.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/llama_alcf.sh b/llama_alcf.sh index a102c492fb..dcfa41d109 100755 --- a/llama_alcf.sh +++ b/llama_alcf.sh @@ -20,7 +20,7 @@ export SP=$((PBS_JOBSIZE*PPN/PP/TP)) export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_small.txt" +export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_select.txt" echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" HIDDEN_SIZE=4096 @@ -34,18 +34,17 @@ OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAY MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ - --num-layers 32 \ - --hidden-size 4096 \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ --ffn-hidden-size 5504 \ --num-attention-heads 32 \ --micro-batch-size ${MBS} \ --global-batch-size ${BS} \ - --seq-length 2048 \ - --max-position-embeddings 2048 \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ --train-iters 10 \ --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ - --data-impl mmap \ --tokenizer-type Llama2Tokenizer \ --split 949,50,1 \ --distributed-backend nccl \ diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 6937bbdf2c..70c656f9f1 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -426,6 +426,18 @@ def git_ds_info(): def main(): # if RANK == 0: # setup_wandb() + from mpi4py import MPI + rank = MPI.COMM_WORLD.rank + + if rank == 0: + master_addr = 
socket.gethostname() + else: + master_addr = None + + master_addr = MPI.COMM_WORLD.bcast(master_addr, root=0) + os.environ["MASTER_ADDR"] = master_addr + os.environ["MASTER_PORT"] = str(2345) + from torch.profiler import profile, record_function, ProfilerActivity with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: model = pretrain( From 9030d779686d83dc41106620b04b739a2130fda4 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 8 Feb 2024 16:08:08 -0600 Subject: [PATCH 053/268] added socket --- pretrain_gpt_alcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 70c656f9f1..536bdca110 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -426,6 +426,7 @@ def git_ds_info(): def main(): # if RANK == 0: # setup_wandb() + import socket from mpi4py import MPI rank = MPI.COMM_WORLD.rank From 7797c2f930166470c77fae258c89bce474dd28e7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 9 Feb 2024 11:03:35 -0600 Subject: [PATCH 054/268] Update `generate_config.sh` --- generate_config.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/generate_config.sh b/generate_config.sh index 210220e192..6a676fa1b1 100644 --- a/generate_config.sh +++ b/generate_config.sh @@ -21,9 +21,12 @@ common="\ \"steps_per_print\": 1, \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, \"optimizer\": { - \"type\": \"Adam\", + \"type\": \"AdamW\", \"params\": { - \"lr\": 0.00015, + \"lr\": ${LR}, + \"beta1\": 0.9, + \"beta2\": 0.95, + \"eps\": 1e-5, \"weight_decay\": 1e-2 } }, From d1367d43b2461743a3faec501d9e258afce00f6e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 10 Feb 2024 20:23:08 -0600 Subject: [PATCH 055/268] Update `set_params.sh,generate_config.sh` --- generate_config.sh | 10 +++++++++- set_params.sh | 23 +++++++++++++---------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/generate_config.sh b/generate_config.sh index 6a676fa1b1..6bea420a2a 100644 
--- a/generate_config.sh +++ b/generate_config.sh @@ -27,7 +27,15 @@ common="\ \"beta1\": 0.9, \"beta2\": 0.95, \"eps\": 1e-5, - \"weight_decay\": 1e-2 + \"weight_decay\": 1e-1 + } + }, + \"scheduler\": { + \"type\": \"WarmupLR\", + \"params\": { + \"warmup_min_lr\": 0.00003, + \"warmup_max_lr\": 0.0003, + \"warmup_num_steps\": 5000 } }, \"zero_allow_untested_optimizer\": true, diff --git a/set_params.sh b/set_params.sh index 41ef9b4908..c87524db13 100644 --- a/set_params.sh +++ b/set_params.sh @@ -7,13 +7,13 @@ export HIDDEN=${HIDDEN:-12288} export HEADS=${HEADS:-96} export LR=${LR:-0.0003} export SEQ=${SEQ:-4096} -export TRAIN_ITER=${TRAIN_ITER:-20} -export EVAL_ITERS=${EVAL_ITERS:-100} -export SAVE_INTERVAL=${SAVE_INTERVAL:-50} -export EVAL_INTERVAL=${EVAL_INTERVAL:-50} -export ZERO_STAGE=${ZERO_STAGE:-3} +export TRAIN_ITER=${TRAIN_ITER:-300000} +export EVAL_ITERS=${EVAL_ITERS:-50} +export SAVE_INTERVAL=${SAVE_INTERVAL:-1000} +export EVAL_INTERVAL=${EVAL_INTERVAL:-50000} +export ZERO_STAGE=${ZERO_STAGE:-2} export DTYPE=${DTYPE:-fp16} -export TP=${TP:-1} +export TP=${TP:-2} export PP=${PP:-1} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) @@ -36,8 +36,9 @@ PP=${PP:-1} # export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" +export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_3280.txt" # export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select.txt" -export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_only_rust.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_only_rust.txt" # export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_modified.txt" # export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_small.txt" @@ -118,10 +119,11 @@ run_cmd=" --global-batch-size $GLOBAL_BATCH \ --train-iters $TRAIN_ITER \ --lr ${LR} \ - --min-lr 1.0e-5 
\ - --lr-warmup-fraction .01 \ - --lr-decay-iters 320000 \ + --lr-warmup-iters 5000 \ + --lr-decay-iters 10000 \ + --ffn-hidden-size 11008 \ --lr-decay-style cosine \ + --data-impl mmap \ --log-interval 1 \ --eval-iters ${EVAL_ITERS} \ --eval-interval ${EVAL_INTERVAL} \ @@ -139,6 +141,7 @@ run_cmd=" --save checkpoints/${OUTPUT_PREFIX} \ --load checkpoints/${OUTPUT_PREFIX} \ --use-checkpoint-opt_param-scheduler \ + --accumulate-allreduce-grads-in-fp32 \ --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ --data-file-list ${DATA_FILE_LIST} \ --data-path $DATA_PATH \ From 615faa46cde9e2a5ecbae3cccb8dcab9ee3a8af8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 13 Feb 2024 09:25:23 -0600 Subject: [PATCH 056/268] Add `train_llama_alcf_chunk_00_of_20.sh` --- train_llama_alcf_chunk_00_of_20.sh | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100755 train_llama_alcf_chunk_00_of_20.sh diff --git a/train_llama_alcf_chunk_00_of_20.sh b/train_llama_alcf_chunk_00_of_20.sh new file mode 100755 index 0000000000..15e4e789aa --- /dev/null +++ b/train_llama_alcf_chunk_00_of_20.sh @@ -0,0 +1,53 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home + +cd "${PBS_O_WORKDIR}" || exit +if [[ ! -d ezpz ]]; then + git clone https://github.com/saforem2/ezpz +else + echo "Found ezpz!" 
+fi +source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit +source ezpz/src/ezpz/bin/getjobenv || exit + +module load conda/2023-10-04; conda activate base +echo "WORLD_SIZE:${WORLD_SIZE}" +echo "NCCL: ${NCCL:-nccl}" + +export PP=1 +export TP=2 + +export HEADS=32 +export NLAYERS=32 +export HIDDEN=4096 +export NUM_KV_HEAD=8 + +export ZERO_STAGE=2 +export MICRO_BATCH=8 +export GRAD_ACC_STEPS=1 +export SEQ=4096 +export DTYPE=fp16 + +export EVAL_ITERS=20 +export TRAIN_ITER=317892 +export SAVE_INTERVAL=5000 +export EVAL_INTERVAL=50000 + +export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" +export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/data_file_list_chunk_0_of_20.txt" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST}" +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + +export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" +echo "MODEL_TYPE: ${MODEL_TYPE}" + +export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" +export USE_ACTIVATION_CHECKPOINTING=1 ; [ -n "${MODEL_TYPE}" ] + +bash set_params.sh "$LLAMA_ARGS" --use-flash-attn-v2 --num-key-value-heads $NUM_KV_HEAD |& tee "train-${MODEL_TYPE}-mbs-${MICRO_BATCH}-zs${ZERO_STAGE}-kvh${NUM_KV_HEAD}-$(tstamp).log" From ce5d89611f673582786f95cb3cd5660c41367a68 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 14 Feb 2024 12:49:30 -0600 Subject: [PATCH 057/268] added profile as an option --- megatron/arguments.py | 1 + pretrain_gpt_alcf.py | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 083bf30f6a..39326218e0 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -952,6 +952,7 @@ def _add_training_args(parser): 
dest='gradient_accumulation_fusion') group.add_argument('--use-dataset-only', type=bool, required=False, default=False, help='If set to True, only use the megatron dataset for external trainer ') + group.add_argument('--profile', action='store_true', help='Enable Torch Profiler') return parser diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 536bdca110..0f1fcb1ad1 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -438,9 +438,22 @@ def main(): master_addr = MPI.COMM_WORLD.bcast(master_addr, root=0) os.environ["MASTER_ADDR"] = master_addr os.environ["MASTER_PORT"] = str(2345) - - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + args = get_args() + + if (args.profile): + from torch.profiler import profile, record_function, ProfilerActivity + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) + + prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + else: model = pretrain( train_valid_test_datasets_provider, model_provider, @@ -449,9 +462,6 @@ def main(): args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) - args = get_args() - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") - # # from megatron.training import get_model # if wandb.run is not None: # args = get_args() From 8280d0172863f6c1e0aad22b08091cbe204f24cf Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 09:54:26 -0600 Subject: [PATCH 058/268] Update `train_llama_alcf_chunk-00-of-20.sh` --- ...0.sh => train_llama_alcf_chunk_00-of-20.sh | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 
deletions(-) rename train_llama_alcf_chunk_00_of_20.sh => train_llama_alcf_chunk_00-of-20.sh (52%) diff --git a/train_llama_alcf_chunk_00_of_20.sh b/train_llama_alcf_chunk_00-of-20.sh similarity index 52% rename from train_llama_alcf_chunk_00_of_20.sh rename to train_llama_alcf_chunk_00-of-20.sh index 15e4e789aa..46a00aea52 100755 --- a/train_llama_alcf_chunk_00_of_20.sh +++ b/train_llama_alcf_chunk_00-of-20.sh @@ -6,18 +6,15 @@ #PBS -l filesystems=eagle:home cd "${PBS_O_WORKDIR}" || exit +module load conda/2023-10-04; conda activate base if [[ ! -d ezpz ]]; then git clone https://github.com/saforem2/ezpz else echo "Found ezpz!" fi -source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit +source ezpz/src/ezpz/bin/savejobenv || exit source ezpz/src/ezpz/bin/getjobenv || exit -module load conda/2023-10-04; conda activate base -echo "WORLD_SIZE:${WORLD_SIZE}" -echo "NCCL: ${NCCL:-nccl}" - export PP=1 export TP=2 @@ -34,20 +31,30 @@ export DTYPE=fp16 export EVAL_ITERS=20 export TRAIN_ITER=317892 -export SAVE_INTERVAL=5000 -export EVAL_INTERVAL=50000 +export SAVE_INTERVAL=200 +# export EVAL_INTERVAL=1000 export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/data_file_list_chunk_0_of_20.txt" - -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST}" -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/20/data_file_list_chunk_0_of_20.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_5.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/40/data_file_list_chunk_0_of_40.txt" +# export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/data_file_list_chunk_0_of_20.txt" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" -echo "MODEL_TYPE: ${MODEL_TYPE}" export 
LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -export USE_ACTIVATION_CHECKPOINTING=1 ; [ -n "${MODEL_TYPE}" ] +export USE_ACTIVATION_CHECKPOINTING=1 ; +export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "- WORLD_SIZE:${WORLD_SIZE}" +echo "- NCCL: ${NCCL:-nccl}" +echo "- MODEL_TYPE: ${MODEL_TYPE}" +echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + + # [ -n "${MODEL_TYPE}" ] && + EXEC="./set_params.sh" + OUTPUT="train-${MODEL_TYPE}-mbs-${MICRO_BATCH}-zs${ZERO_STAGE}-kvh${NUM_KV_HEAD}-$(tstamp).log" -bash set_params.sh "$LLAMA_ARGS" --use-flash-attn-v2 --num-key-value-heads $NUM_KV_HEAD |& tee "train-${MODEL_TYPE}-mbs-${MICRO_BATCH}-zs${ZERO_STAGE}-kvh${NUM_KV_HEAD}-$(tstamp).log" + [ -f "${EXEC}" ] && bash "${EXEC}" "${LLAMA_ARGS}" "${EXTRA_ARGS}" |& tee "${OUTPUT}" From 63385a8c4a4114ecb81483b5997409977c064ee4 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 10:03:02 -0600 Subject: [PATCH 059/268] Update `megatron/data/blendable_dataset.py` --- megatron/data/blendable_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index 2516e58415..78ab7cb005 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -34,8 +34,8 @@ def __init__(self, datasets, weights, size, *, # Build indicies. 
def _build_indices(): start_time = time.time() - assert num_datasets < 255 - dataset_index = np.zeros(self.size, dtype=np.uint8) + # assert num_datasets < 255 + dataset_index = np.zeros(self.size, dtype=np.int64) dataset_sample_index = np.zeros(self.size, dtype=np.int64) from megatron.data import helpers From 74e403f7432e452a4b10a925786a4abf9428f6a0 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 10:31:40 -0600 Subject: [PATCH 060/268] Update `set_params.sh` --- set_params.sh | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/set_params.sh b/set_params.sh index c87524db13..c57eea34cb 100644 --- a/set_params.sh +++ b/set_params.sh @@ -26,25 +26,8 @@ export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} TP=${TP:-1} PP=${PP:-1} -# export DATA_PARENT="/home/foremans/polaris/projects/saforem2/Megatron-DeepSpeed" -# export DATA_TYPE="BookCorpusDataset_text_document" -# export DATA_PARENT="/lus/eagle/projects/datasets/Megatron-DeepSpeed/GenSLMSubSample200k" -# export DATA_TYPE="genslm_subsample_200k_sequence_document" -# export DATA_DIR="${DATA_PARENT}/dataset" -# export DATA_PATH="${DATA_DIR}/${DATA_TYPE}" -# export VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" -# export MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" - -export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_3280.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_only_rust.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_modified.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_small.txt" - - DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" -bash ./generate_config.sh ${DS_CONFIG} || exit 1 +bash ./generate_config.sh "${DS_CONFIG}" || exit 1 
OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} @@ -55,12 +38,12 @@ echo "!!!Please see logs at ${OUTPUT_DIR}" # Hostfile path hostfile_deepspeed=./hostfile_deepspeed hostfile_mpich=./hostfile_mpich -cat $PBS_NODEFILE > hostfile_mpich -cat $PBS_NODEFILE > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed +cat "$PBS_NODEFILE" > hostfile_mpich +cat "$PBS_NODEFILE" > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed ds_args=" " ds_args=" --deepspeed ${ds_args}" -if [ $PP == 1 ]; then +if [ "$PP" == 1 ]; then ds_args=" --no-pipeline-parallel ${ds_args}" fi ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" From cc6c8805452f4a8185736003ba64bd4648eec170 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 10:31:56 -0600 Subject: [PATCH 061/268] Update `megatron/training.py` --- megatron/training.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 29d2181d46..ef32cd3856 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1066,21 +1066,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, elapsed_time_per_iteration, args.consumed_train_samples) writer.add_scalar('iteration-time/iteration-time vs tokens', elapsed_time_per_iteration, args.consumed_train_tokens) - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb_metrics |= { - 'iteration': iteration, - 'iteration_time': elapsed_time_per_iteration, - 'iteration_time_vs_tokens': ( - (elapsed_time_per_iteration - / args.consumed_train_tokens) - ), - 'iteration_time_vs_samples': ( - (elapsed_time_per_iteration - / args.consumed_train_samples), - ), - } - if wandb is not None 
and getattr(wandb, 'run', None) is not None: - wandb.log(wandb_metrics) log_string = ' iteration {:8d}/{:8d} |'.format( iteration, args.train_iters) log_string += ' consumed samples: {:12d} |'.format( @@ -1091,6 +1076,21 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, elapsed_time_per_iteration * 1000.0) log_string += ' learning rate: {:.3E} |'.format(learning_rate) log_string += ' global batch size: {:5d} |'.format(batch_size) + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + 'training/iteration': iteration, + 'training/iteration_time': elapsed_time_per_iteration, + 'training/iteration_time_vs_tokens': ( + (elapsed_time_per_iteration + / args.consumed_train_tokens) + ), + 'training/iteration_time_vs_samples': ( + (elapsed_time_per_iteration + / args.consumed_train_samples), + ), + 'training/consumed_samples': args.consumed_train_samples, + 'training/consumed_tokens': args.consumed_train_tokens, + } for key in total_loss_dict: if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: @@ -1099,6 +1099,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, if avg > 0.0: log_string += ' {}: {:.6E} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb.log(wandb_metrics) if loss_scale is not None: log_string += ' loss scale: {:.1f} |'.format(loss_scale) if grad_norm is not None: From ead0bc6d5197f8f3c40b0d29a0212c098d307e4b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 10:32:33 -0600 Subject: [PATCH 062/268] Update `megatron/data/indexed_dataset.py` --- megatron/data/indexed_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 219ffe8031..62ebdc9813 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -504,6 +504,7 @@ def _do_init(self, 
path, skip_warmup): print_rank_0(" warming up data mmap file...") _warmup_mmap_file(data_file_path(self._path)) print_rank_0(" creating numpy buffer of mmap...") + print_rank_0(data_file_path(self._path)) self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C') print_rank_0(" creating memory view of numpy buffer...") self._bin_buffer = memoryview(self._bin_buffer_mmap) From 9cde45ad366797b2d1c7b7db8d01cf97add83e0c Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 19 Feb 2024 10:33:37 -0600 Subject: [PATCH 063/268] Update `megatron/data/gpt_dataset.py` --- megatron/data/gpt_dataset.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 1d9b7e1c1d..210a92c85e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -273,7 +273,31 @@ def __getitem__(self, idx): args = get_args() orig_idx = idx # Get the shuffled index. - idx = self.shuffle_idx[idx] + try: + idx = self.shuffle_idx[idx] + except IndexError as exc: + if is_rank_0(): + import json + from rich import print_json + print(exc) + print( + '\n'.join( + ['-------------------------------------------------', + f'Trying to access {idx=} from self.shuffle_idx,', + f'but {len(self.shuffle_idx)=}', + '-------------------------------------------------'] + ) + ) + print_json( + json.dumps( + { + 'doc_idx': len(self.doc_idx), + 'sample_idx': len(self.sample_idx), + 'shuffle_idx': len(self.shuffle_idx), + }, + indent=4, + ) + ) # Start and end documents and offsets. 
doc_index_f = self.sample_idx[idx][0] doc_index_l = self.sample_idx[idx + 1][0] From bffa9f89e8237311fcfa265e138ff5edd49bab7a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 20 Feb 2024 23:45:34 -0600 Subject: [PATCH 064/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 9737477a9e..36ef5d153f 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -58,7 +58,7 @@ 'WB_PROJECT', os.environ.get( 'WANDB_PROJECT', - 'GenSLM-Megatron-DS' + 'AuroraGPT' ), ) ) From 0a427230553de99d6db1563f9c1503799adc11c8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 20 Feb 2024 23:45:49 -0600 Subject: [PATCH 065/268] Update `set_params.sh` --- set_params.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/set_params.sh b/set_params.sh index c57eea34cb..8682a86907 100644 --- a/set_params.sh +++ b/set_params.sh @@ -19,6 +19,8 @@ export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} +# echo "USING DATA_FILE_LIST: ${DATA_FILE_LIST}" || exit + # bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ @@ -127,7 +129,6 @@ run_cmd=" --accumulate-allreduce-grads-in-fp32 \ --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ --data-file-list ${DATA_FILE_LIST} \ - --data-path $DATA_PATH \ ${gpt_args[*]} \ $custom_args \ |& tee $OUTPUT_DIR/output.log From 2bf20836a7b1610c567231571ebcdbc5d5b0e1f7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 21 Feb 2024 11:29:48 -0600 Subject: [PATCH 066/268] Add `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 193 ++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 train_llama_alcf_polaris.sh diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh new file mode 100644 
index 0000000000..a06bfcd194 --- /dev/null +++ b/train_llama_alcf_polaris.sh @@ -0,0 +1,193 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home + +cd "${PBS_O_WORKDIR}" || exit +module load conda/2023-10-04; conda activate base +if [[ ! -d ezpz ]]; then + git clone https://github.com/saforem2/ezpz +else + echo "Found ezpz!" +fi +source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit +source ezpz/src/ezpz/bin/getjobenv || exit + +# ---- Parallelism Settings ---- +export PP=${PP:-1} +export TP=${TP:-2} +# ------------------------------ + +HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" +export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} + +# ---- Llama2 7B Config ----------------------- +export HEADS=${HEADS:-32} +export NLAYERS=${NLAYERS:-32} +export HIDDEN=${HIDDEN:-4096} +export NUM_KV_HEAD=${NUM_KV_HEAD:-8} +# --------------------------------------------- + +# ---- Run Settings ------------------------------------------ +export LR=${LR:-0.00015} +export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 +export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 +export ZERO_STAGE=${ZERO_STAGE:-2} +export MICRO_BATCH=${MICRO_BATCH:-8} +export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} +export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) +# --------------------------------------------- + +export EVAL_ITERS=${EVAL_ITERS:-20} +export TRAIN_ITER=${TRAIN_ITER:-317892} +export SAVE_INTERVAL=${SAVE_INTERVAL:-200} +export EVAL_INTERVAL=${EVAL_INTERVAL:-50000} +export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} +# export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} + +# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/40/data_file_list_chunk_0_of_40.txt" +# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/10/data_file_list_chunk_0_of_10.txt" +export DATA_FILE_LIST="./dolma_data_file_list-00-of-04.txt" +# export 
DATA_FILE_LIST="./dolma-chunk-00-of-40.txt" + +export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" + +export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" +export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" + +# export DATA_CACHE_PATH="${DATA_CACHE_PATH}" +if [[ -n "$DATA_CACHE_PATH" ]]; then + echo "Using DATA_CACHE_PATH: ${DATA_CACHE_PATH}" + EXTRA_ARGS="${EXTRA_ARGS} --data-cache-path ${DATA_CACHE_PATH}" +else + echo "Not using DATA_CACHE_PATH !!" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "- WORLD_SIZE:${WORLD_SIZE}" +echo "- NCCL: ${NCCL:-nccl}" +echo "- MODEL_TYPE: ${MODEL_TYPE}" +echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + + +# bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ + +DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" +bash ./generate_config.sh "${DS_CONFIG}" || exit 1 + +OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" +# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} +OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" +mkdir -p "${OUTPUT_DIR}" +echo "!!!Please see logs at ${OUTPUT_DIR}" + +# Hostfile path +hostfile_deepspeed=./hostfile_deepspeed +hostfile_mpich=./hostfile_mpich +cat "$PBS_NODEFILE" > hostfile_mpich +cat "$PBS_NODEFILE" > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed + +ds_args=" " +ds_args=" --deepspeed ${ds_args}" +if [ "$PP" == 1 ]; then + ds_args=" --no-pipeline-parallel ${ds_args}" +fi +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" 
--zero-stage=$ZERO_STAGE ${ds_args}" + +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + # --checkpoint-activations \ + # --deepspeed-activation-checkpointing +fi + +gpt_args=() + +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers 1" + ) +fi +# we are now using activation checkpoint provided by megatron, see below. +# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" +# NUM_KV_HEADS="${NUM_KV_HEADS:-0}" +# if [[ $NUM_KV_HEADS -]] + +# take custom args +custom_args=" $@" + +# launcher setting +LAUNCHER=${LAUNCHER:-MPICH} +if [[ $LAUNCHER == "deepspeed" ]]; then + launcher="" +else + launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" +fi + +NCCL=${NCCL:-nccl} +EXEC="./pretrain_gpt_alcf.py" + +# MODEL=LLAMA_7B +# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} + +# --vocab-file $VOCAB_FILE \ +# --merge-file $MERGE_FILE \ +# --lr-decay-iters 320000 \ + # --num-workers 0 \ +run_cmd=" + deepspeed $launcher ${EXEC} \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --num-layers $NLAYERS \ + --hidden-size $HIDDEN \ + --num-attention-heads $HEADS \ + --seq-length $SEQ \ + --max-position-embeddings $SEQ \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --train-iters $TRAIN_ITER \ + --lr ${LR} \ + --lr-warmup-iters 5000 \ + --lr-decay-iters 10000 \ + --ffn-hidden-size 11008 \ + --lr-decay-style cosine \ + --data-impl mmap \ + --log-interval 1 \ + --eval-iters ${EVAL_ITERS} \ + --eval-interval ${EVAL_INTERVAL} \ + --save-interval ${SAVE_INTERVAL} \ + --split 90,5,5 \ + 
--$DTYPE \ + $ds_args \ + --no-masked-softmax-fusion \ + --no-bias-gelu-fusion \ + --no-bias-dropout-fusion \ + --no-gradient-accumulation-fusion \ + --distributed-backend $NCCL \ + --tokenizer-type Llama2Tokenizer \ + --save checkpoints/${OUTPUT_PREFIX} \ + --load checkpoints/${OUTPUT_PREFIX} \ + --use-checkpoint-opt_param-scheduler \ + --accumulate-allreduce-grads-in-fp32 \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --data-file-list ${DATA_FILE_LIST} \ + --num-workers 4 \ + ${LLAMA_ARGS} \ + ${EXTRA_ARGS} \ + ${gpt_args[*]} \ + $custom_args \ + |& tee $OUTPUT_DIR/output.log + " + +echo "Using $(which deepspeed)" +ds_report + +echo ${run_cmd} +eval ${run_cmd} +set +x From 346ddc38ea9e15b1703503eb9798c4c7e1c07a6e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 21 Feb 2024 15:50:38 -0600 Subject: [PATCH 067/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 124 +++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 17 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index a06bfcd194..049d854f71 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -5,21 +5,59 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home +ezpz() { + if [[ ! -d ezpz ]]; then + git clone https://github.com/saforem2/ezpz + else + echo "Found ezpz!" 
+ fi + echo "Using :snake: $(which python3) to install \`ezpz\`:" + mkdir -p logs + python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit + source ezpz/src/ezpz/bin/getjobenv || exit +} + +makeHostfiles() { + GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') + export GPUS_PER_NODE="${GPUS_PER_NODE}" + # ---- Make MPICH hostfile ---------------- + export hostfile_mpich=hostfile_mpich + cat "$PBS_NODEFILE" > "${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" + { + echo "PATH=${PATH}" ; + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; + echo "http_proxy=${http_proxy}" ; + echo "https_pro]xy=${https_proxy}" ; + echo "CFLAGS=${CFLAGS}" ; + echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; + } > .deepspeed_env + # echo "PATH=${PATH}" > .deepspeed_env + # echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env + # echo "http_proxy=${http_proxy}" >> .deepspeed_env + # echo "https_proxy=${https_proxy}" >> .deepspeed_env + # echo "CFLAGS=${CFLAGS}" >> .d eepspeed_env + # echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env + # ------------------------------------------------- +} + + +# ==== SCRIPT START ======================================================== cd "${PBS_O_WORKDIR}" || exit module load conda/2023-10-04; conda activate base -if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz -else - echo "Found ezpz!" 
-fi -source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit -source ezpz/src/ezpz/bin/getjobenv || exit +ezpz +makeHostfiles # ---- Parallelism Settings ---- export PP=${PP:-1} export TP=${TP:-2} # ------------------------------ +export HERE=$(python3 -c 'import os; print(os.getcwd())') HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} @@ -34,7 +72,7 @@ export NUM_KV_HEAD=${NUM_KV_HEAD:-8} export LR=${LR:-0.00015} export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 -export ZERO_STAGE=${ZERO_STAGE:-2} +export ZERO_STAGE=${ZERO_STAGE:-1} export MICRO_BATCH=${MICRO_BATCH:-8} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) @@ -47,24 +85,76 @@ export EVAL_INTERVAL=${EVAL_INTERVAL:-50000} export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} +export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" +export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/40/data_file_list_chunk_0_of_40.txt" # export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/10/data_file_list_chunk_0_of_10.txt" -export DATA_FILE_LIST="./dolma_data_file_list-00-of-04.txt" +# + +# export DOLMA_CHUNK_00_of_10="./chunks/10/data_file_list_chunk_00_of_10.txt" # 762 documents (lines) +# export DOLMA_CHUNK_01_of_10="./chunks/10/data_file_list_chunk_01_of_10.txt" # 722 +# export DOLMA_CHUNK_02_of_10="./chunks/10/data_file_list_chunk_02_of_10.txt" # 727 +# export DOLMA_CHUNK_03_of_10="./chunks/10/data_file_list_chunk_03_of_10.txt" # 707 +# export DOLMA_CHUNK_04_of_10="./chunks/10/data_file_list_chunk_04_of_10.txt" # 
744 +# export DOLMA_CHUNK_05_of_10="./chunks/10/data_file_list_chunk_05_of_10.txt" # 766 +# export DOLMA_CHUNK_06_of_10="./chunks/10/data_file_list_chunk_06_of_10.txt" # 730 +# export DOLMA_CHUNK_07_of_10="./chunks/10/data_file_list_chunk_07_of_10.txt" # 759 +# export DOLMA_CHUNK_08_of_10="./chunks/10/data_file_list_chunk_08_of_10.txt" # 777 +# export DOLMA_CHUNK_09_of_10="./chunks/10/data_file_list_chunk_09_of_10.txt" # 752 + +# +# export DOLMA_CHUNK_00_of_04="./dolma_data_file_list-00-of-04.txt" # 1860 documents (lines) +# export DOLMA_CHUNK_01_of_04="./dolma_data_file_list-01-of-04.txt" # 1860 documents (lines) +# export DOLMA_CHUNK_02_of_04="./dolma_data_file_list-02-of-04.txt" # 1860 documents (lines) +# export DOLMA_CHUNK_03_of_04="./dolma_data_file_list-03-of-04.txt" # 1860 documents (lines) +# export DOLMA_CHUNK_04_of_04="./dolma_data_file_list-04-of-04.txt" # 6 documents (lines) + + + +# if [[ -n "$DEBUG_RUN" ]]; then +# # echo "Using LAST DOLMA CHUNK {09 / 10} with ${NDOCS} documents..." +# export DATA_FILE_LIST=${DATA_FILE_LIST:-${DOLMA_CHUNK_09_of_10}} +# # export ndocs=$(wc -l < "${DATA_FILE_LIST}") +# else +# # export fname="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" +# # export fname="${DOLMA_CHUNK_!{DOLMA_CHUNK_IDX}_of_10}" +# # export DATA_FILE_LIST="${DATA_FILE_LIST:-${DOLMA_CHUNK_00_of_10}}" +# fi +# export ndocs +# export DATA_FILE_LIST="${DATA_FILE_LIST}" +# export DATA_FILE_LIST="./dolma_data_file_list-00-of-04.txt" # export DATA_FILE_LIST="./dolma-chunk-00-of-40.txt" +# +# bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit -export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" +# export DATA_CACHE_PATH="${DATA_CACHE_PATH}" +# if [[ -z "$DATA_CACHE_PATH" ]]; then +# echo "Not using DATA_CACHE_PATH !!" 
+# else +# echo "Using DATA_CACHE_PATH: ${DATA_CACHE_PATH}" +# fi -export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" -# export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -if [[ -n "$DATA_CACHE_PATH" ]]; then - echo "Using DATA_CACHE_PATH: ${DATA_CACHE_PATH}" - EXTRA_ARGS="${EXTRA_ARGS} --data-cache-path ${DATA_CACHE_PATH}" +# export DATA_FILE_LIST="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" +export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" +export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_20.txt" +# export DATA_FILE_LIST="./dolma_data_file_list-${DOLMA_CHUNK_IDX}-of-04.txt" + +NDOCS=$(wc -l < "${DATA_FILE_LIST}") +echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." +export NDOCS="${NDOCS}" + +if [[ -z "${DATA_CACHE_PATH}" ]]; then + data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + export DATA_CACHE_PATH="${HERE}/.cache/${data_file_list_stem}-index-cache" else - echo "Not using DATA_CACHE_PATH !!" + export DATA_CACHE_PATH="${DATA_CACHE_PATH}" + echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" 
fi +mkdir -p "${DATA_CACHE_PATH}" + echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- WORLD_SIZE:${WORLD_SIZE}" echo "- NCCL: ${NCCL:-nccl}" @@ -72,7 +162,6 @@ echo "- MODEL_TYPE: ${MODEL_TYPE}" echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - # bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" @@ -177,6 +266,7 @@ run_cmd=" --accumulate-allreduce-grads-in-fp32 \ --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ --data-file-list ${DATA_FILE_LIST} \ + --data-cache-path ${DATA_CACHE_PATH} \ --num-workers 4 \ ${LLAMA_ARGS} \ ${EXTRA_ARGS} \ From bf536554cae05ee8929147c0fc276977f2760de0 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 21 Feb 2024 20:57:38 -0600 Subject: [PATCH 068/268] Remove `train_llama_alcf_chunk_00-of-20.sh` --- train_llama_alcf_chunk_00-of-20.sh | 60 ------------------------------ 1 file changed, 60 deletions(-) delete mode 100755 train_llama_alcf_chunk_00-of-20.sh diff --git a/train_llama_alcf_chunk_00-of-20.sh b/train_llama_alcf_chunk_00-of-20.sh deleted file mode 100755 index 46a00aea52..0000000000 --- a/train_llama_alcf_chunk_00-of-20.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home - -cd "${PBS_O_WORKDIR}" || exit -module load conda/2023-10-04; conda activate base -if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz -else - echo "Found ezpz!" 
-fi -source ezpz/src/ezpz/bin/savejobenv || exit -source ezpz/src/ezpz/bin/getjobenv || exit - -export PP=1 -export TP=2 - -export HEADS=32 -export NLAYERS=32 -export HIDDEN=4096 -export NUM_KV_HEAD=8 - -export ZERO_STAGE=2 -export MICRO_BATCH=8 -export GRAD_ACC_STEPS=1 -export SEQ=4096 -export DTYPE=fp16 - -export EVAL_ITERS=20 -export TRAIN_ITER=317892 -export SAVE_INTERVAL=200 -# export EVAL_INTERVAL=1000 - -export DATA_PATH="/eagle/datasets/dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/20/data_file_list_chunk_0_of_20.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select_5.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/40/data_file_list_chunk_0_of_40.txt" -# export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/data_file_list_chunk_0_of_20.txt" - -export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" - -export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -export USE_ACTIVATION_CHECKPOINTING=1 ; -export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" - -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -echo "- WORLD_SIZE:${WORLD_SIZE}" -echo "- NCCL: ${NCCL:-nccl}" -echo "- MODEL_TYPE: ${MODEL_TYPE}" -echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - - # [ -n "${MODEL_TYPE}" ] && - EXEC="./set_params.sh" - OUTPUT="train-${MODEL_TYPE}-mbs-${MICRO_BATCH}-zs${ZERO_STAGE}-kvh${NUM_KV_HEAD}-$(tstamp).log" - - [ -f "${EXEC}" ] && bash "${EXEC}" "${LLAMA_ARGS}" "${EXTRA_ARGS}" |& tee "${OUTPUT}" From 28ba58faecae9bdd12f4f6363cec615370ff14d0 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 21 Feb 2024 20:57:50 -0600 Subject: [PATCH 069/268] Add `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 69 
+++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 049d854f71..3143f2d69f 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -11,7 +11,7 @@ ezpz() { else echo "Found ezpz!" fi - echo "Using :snake: $(which python3) to install \`ezpz\`:" + echo "Using $(which python3) to install \`ezpz\`:" mkdir -p logs python3 -m pip install -e ezpz > ezpz-install.log 2>&1 source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit @@ -45,19 +45,39 @@ makeHostfiles() { # ------------------------------------------------- } +setupData() { + local cidx=$1 + # export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" + # HERE=$(python3 -c 'import os; print(os.getcwd())') + export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" + NDOCS=$(wc -l < "${DATA_FILE_LIST}") + export NDOCS="${NDOCS}" + if [[ -z "${DATA_CACHE_PATH}" ]]; then + data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + export DATA_CACHE_PATH=".cache/${data_file_list_stem}-index-cache" + else + export DATA_CACHE_PATH="${DATA_CACHE_PATH}" + echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" + fi + export DOLMA_CHUNK_IDX="${cidx}" + mkdir -p "${DATA_CACHE_PATH}" +} + # ==== SCRIPT START ======================================================== cd "${PBS_O_WORKDIR}" || exit module load conda/2023-10-04; conda activate base ezpz makeHostfiles +setupData "${DOLMA_CHUNK_IDX:-0}" +echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." 
# ---- Parallelism Settings ---- export PP=${PP:-1} export TP=${TP:-2} # ------------------------------ -export HERE=$(python3 -c 'import os; print(os.getcwd())') +HERE=$(python3 -c 'import os; print(os.getcwd())') HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} @@ -78,10 +98,21 @@ export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # --------------------------------------------- -export EVAL_ITERS=${EVAL_ITERS:-20} -export TRAIN_ITER=${TRAIN_ITER:-317892} +if [[ "${DOLMA_CHUNK_IDX}" == 0 ]]; then + TRAIN_ITER=78739 +elif [[ "${DOLMA_CHUNK_IDX}" == 1 ]]; then + TRAIN_ITER=81008 +elif [[ "${DOLMA_CHUNK_IDX}" == 2 ]]; then + TRAIN_ITER=79591 +elif [[ "${DOLMA_CHUNK_IDX}" == 3 ]]; then + TRAIN_ITER=78552 +else + echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" + exit +fi +# export TRAIN_ITER=${TRAIN_ITER:-317892} + export SAVE_INTERVAL=${SAVE_INTERVAL:-200} -export EVAL_INTERVAL=${EVAL_INTERVAL:-50000} export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} @@ -134,27 +165,13 @@ export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings # echo "Using DATA_CACHE_PATH: ${DATA_CACHE_PATH}" # fi + export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" # export DATA_FILE_LIST="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" -export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" -export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_20.txt" +# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_20.txt" # export DATA_FILE_LIST="./dolma_data_file_list-${DOLMA_CHUNK_IDX}-of-04.txt" -NDOCS=$(wc -l < "${DATA_FILE_LIST}") -echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} 
documents..." -export NDOCS="${NDOCS}" - -if [[ -z "${DATA_CACHE_PATH}" ]]; then - data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - export DATA_CACHE_PATH="${HERE}/.cache/${data_file_list_stem}-index-cache" -else - export DATA_CACHE_PATH="${DATA_CACHE_PATH}" - echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" -fi - -mkdir -p "${DATA_CACHE_PATH}" - echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- WORLD_SIZE:${WORLD_SIZE}" echo "- NCCL: ${NCCL:-nccl}" @@ -229,12 +246,17 @@ EXEC="./pretrain_gpt_alcf.py" # --merge-file $MERGE_FILE \ # --lr-decay-iters 320000 \ # --num-workers 0 \ + # --eval-iters ${EVAL_ITERS} \ + # --eval-interval ${EVAL_INTERVAL} \ + # --lr-warmup-iters 5000 \ + # --lr-decay-iters 10000 \ run_cmd=" deepspeed $launcher ${EXEC} \ --tensor-model-parallel-size $TP \ --pipeline-model-parallel-size $PP \ --num-layers $NLAYERS \ --hidden-size $HIDDEN \ + --ffn-hidden-size 11008 \ --num-attention-heads $HEADS \ --seq-length $SEQ \ --max-position-embeddings $SEQ \ @@ -242,14 +264,9 @@ run_cmd=" --global-batch-size $GLOBAL_BATCH \ --train-iters $TRAIN_ITER \ --lr ${LR} \ - --lr-warmup-iters 5000 \ - --lr-decay-iters 10000 \ - --ffn-hidden-size 11008 \ --lr-decay-style cosine \ --data-impl mmap \ --log-interval 1 \ - --eval-iters ${EVAL_ITERS} \ - --eval-interval ${EVAL_INTERVAL} \ --save-interval ${SAVE_INTERVAL} \ --split 90,5,5 \ --$DTYPE \ From 7f9af8dd5051838761aa2fe1cdaf37054cb291ab Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 23 Feb 2024 08:19:54 -0600 Subject: [PATCH 070/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 177 +++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 52 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 3143f2d69f..904dc5d6fd 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -4,6 +4,42 @@ #PBS -q prod #PBS -l select=48 #PBS -l 
filesystems=eagle:home +# +set +x +loadCondaEnv() { + if [[ "${CONDA_EXE}" ]]; then + echo "Already inside ${CONDA_EXE}, exiting!" + else + MODULE_STR="$1" + module load "conda/${MODULE_STR}" + conda activate base + fi +} + +setupVenv() { + VENV_DIR="$1" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + source "${VENV_DIR}/bin/activate" + else + echo "Skipping setupVenv() on $(hostname)" + fi +} + +setupPython() { + local conda_date=$1 + local venv_path=$2 + if [[ "${CONDA_EXE}" ]]; then + echo "Caught CONDA_EXE: ${CONDA_EXE}" + else + loadCondaEnv "${conda_date}" + fi + if [[ "${VIRTUAL_ENV}" ]]; then + echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" + else + setupVenv "${venv_path}" + fi +} ezpz() { if [[ ! -d ezpz ]]; then @@ -11,9 +47,13 @@ ezpz() { else echo "Found ezpz!" fi - echo "Using $(which python3) to install \`ezpz\`:" - mkdir -p logs - python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + echo "Has ezpz installed. Nothing to do." + else + echo "Does not have ezpz installed. Installing..." + echo "Using $(which python3) to install \`ezpz\`:" + python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + fi source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit source ezpz/src/ezpz/bin/getjobenv || exit } @@ -46,38 +86,68 @@ makeHostfiles() { } setupData() { - local cidx=$1 + cidx=$1 + echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" 
# export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" # HERE=$(python3 -c 'import os; print(os.getcwd())') - export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" - NDOCS=$(wc -l < "${DATA_FILE_LIST}") + # export DATA_FILE_LIST="/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" + # export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select.txt" + # export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" + # export DATA_FILE_LIST="./chunks/4/data_file_list_chunk_${cidx}_of_4.txt" + # export DATA_FILE_LIST="./chunks/4/data_file_list_chunk_${cidx}_of_4.txt" + # export DATA_FILE_LIST="./dolma_data_file_list-${cidx}-of-4.txt" + export DATA_FILE_LIST="${DATA_FILE_LIST:-"./dolma-shuf-chunk-${cidx}-of-4.txt"}" + echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST}" + # [ -f "$DATA_FILE_LIST" ] || exit + NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" export NDOCS="${NDOCS}" - if [[ -z "${DATA_CACHE_PATH}" ]]; then - data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - export DATA_CACHE_PATH=".cache/${data_file_list_stem}-index-cache" - else - export DATA_CACHE_PATH="${DATA_CACHE_PATH}" - echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" - fi + # if [[ -z "${DATA_CACHE_PATH}" ]]; then + # else + # echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" 
+ # DATA_CACHE_PATH="${DATA_CACHE_PATH}" + # fi + data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") export DOLMA_CHUNK_IDX="${cidx}" + export DATA_FILE_LIST_STEM="${data_file_list_stem}" + export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" mkdir -p "${DATA_CACHE_PATH}" } # ==== SCRIPT START ======================================================== -cd "${PBS_O_WORKDIR}" || exit +# cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +# if [[ -z "${CONDA_EXE}" ]]; then module load conda/2023-10-04; conda activate base +# else +# echo "Caught CONDA_EXE = ${CONDA_EXE} from env" +# fi + +# if [[ -z "${VIRTUAL_ENV}" ]]; then +# source /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/venvs/polaris/2023-10-04/bin/activate || exit +# source ~/venvs/polaris/2023-10-04/bin/activate || exit +# else +# echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from env" +# fi +# if [[ "${VIRTUAL_ENV}" ]]; then +# echo "Caught virtual env from environment, using ${VIRTUAL_ENV}" +# else +echo "Using $(which python3)" + ezpz makeHostfiles setupData "${DOLMA_CHUNK_IDX:-0}" +# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" +# export NDOCS="${NDOCS}" echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." 
# ---- Parallelism Settings ---- -export PP=${PP:-1} -export TP=${TP:-2} +PP=${PP:-1} +TP=${TP:-2} +export PP="${PP}" +export TP="${TP}" # ------------------------------ -HERE=$(python3 -c 'import os; print(os.getcwd())') HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} @@ -92,11 +162,12 @@ export NUM_KV_HEAD=${NUM_KV_HEAD:-8} export LR=${LR:-0.00015} export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 -export ZERO_STAGE=${ZERO_STAGE:-1} +export ZERO_STAGE=${ZERO_STAGE:-2} export MICRO_BATCH=${MICRO_BATCH:-8} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # --------------------------------------------- +export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" if [[ "${DOLMA_CHUNK_IDX}" == 0 ]]; then TRAIN_ITER=78739 @@ -107,9 +178,14 @@ elif [[ "${DOLMA_CHUNK_IDX}" == 2 ]]; then elif [[ "${DOLMA_CHUNK_IDX}" == 3 ]]; then TRAIN_ITER=78552 else - echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" - exit + echo "caught DOLMA_CHUNK_IDX=${DOLMA_CHUNK_IDX}" + TRAIN_ITER="${TRAIN_ITER:-320000}" + echo "Setting TRAIN_ITER=${TRAIN_ITER}" + # echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" fi + +export EVAL_ITERS="${EVAL_ITERS:-10}" +export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" # export TRAIN_ITER=${TRAIN_ITER:-317892} export SAVE_INTERVAL=${SAVE_INTERVAL:-200} @@ -166,7 +242,7 @@ export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings # fi -export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" +# export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" # export DATA_FILE_LIST="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" # export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_20.txt" @@ -182,23 +258,17 @@ echo 
"++++++++++++++++++++++++++++++++++++++++++++++++++" # bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" -bash ./generate_config.sh "${DS_CONFIG}" || exit 1 +bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 -OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" +OUTPUT_PREFIX="${HERE}/logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" mkdir -p "${OUTPUT_DIR}" echo "!!!Please see logs at ${OUTPUT_DIR}" -# Hostfile path -hostfile_deepspeed=./hostfile_deepspeed -hostfile_mpich=./hostfile_mpich -cat "$PBS_NODEFILE" > hostfile_mpich -cat "$PBS_NODEFILE" > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed - ds_args=" " ds_args=" --deepspeed ${ds_args}" -if [ "$PP" == 1 ]; then +if [[ $PP == 1 ]]; then ds_args=" --no-pipeline-parallel ${ds_args}" fi ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" @@ -237,7 +307,7 @@ else fi NCCL=${NCCL:-nccl} -EXEC="./pretrain_gpt_alcf.py" +EXEC="pretrain_gpt_alcf.py" # MODEL=LLAMA_7B # OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} @@ -246,51 +316,54 @@ EXEC="./pretrain_gpt_alcf.py" # --merge-file $MERGE_FILE \ # --lr-decay-iters 320000 \ # --num-workers 0 \ - # --eval-iters ${EVAL_ITERS} \ - # --eval-interval ${EVAL_INTERVAL} \ # --lr-warmup-iters 5000 \ # --lr-decay-iters 10000 \ + # --num-workers 4 \ + # launch python3 ${EXEC} \ run_cmd=" deepspeed $launcher ${EXEC} \ - --tensor-model-parallel-size $TP \ - --pipeline-model-parallel-size $PP \ + --$DTYPE 
\ + --lr ${LR} \ + --log-interval 1 \ + --seq-length $SEQ \ --num-layers $NLAYERS \ --hidden-size $HIDDEN \ --ffn-hidden-size 11008 \ + --train-iters $TRAIN_ITER \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend $NCCL \ --num-attention-heads $HEADS \ - --seq-length $SEQ \ --max-position-embeddings $SEQ \ --micro-batch-size $MICRO_BATCH \ - --global-batch-size $GLOBAL_BATCH \ - --train-iters $TRAIN_ITER \ - --lr ${LR} \ - --lr-decay-style cosine \ - --data-impl mmap \ - --log-interval 1 \ --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --tensor-model-parallel-size $TP \ + --global-batch-size $GLOBAL_BATCH \ + --pipeline-model-parallel-size $PP \ + --data-file-list ${DATA_FILE_LIST} \ + --load checkpoints/${OUTPUT_PREFIX} \ + --save checkpoints/${OUTPUT_PREFIX} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --tokenizer-model ${TOKENIZER_MODEL} \ --split 90,5,5 \ - --$DTYPE \ - $ds_args \ + --data-impl mmap \ --no-masked-softmax-fusion \ --no-bias-gelu-fusion \ --no-bias-dropout-fusion \ --no-gradient-accumulation-fusion \ - --distributed-backend $NCCL \ + --use-flash-attn-v2 \ + --lr-decay-style cosine \ --tokenizer-type Llama2Tokenizer \ - --save checkpoints/${OUTPUT_PREFIX} \ - --load checkpoints/${OUTPUT_PREFIX} \ --use-checkpoint-opt_param-scheduler \ --accumulate-allreduce-grads-in-fp32 \ - --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ - --data-file-list ${DATA_FILE_LIST} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --num-workers 4 \ + $ds_args \ ${LLAMA_ARGS} \ - ${EXTRA_ARGS} \ ${gpt_args[*]} \ $custom_args \ |& tee $OUTPUT_DIR/output.log " + # ${EXTRA_ARGS} \ echo "Using $(which deepspeed)" ds_report From 7b00ac4a1351d81befee1303bd89c2d143a522f7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 23 Feb 2024 15:57:35 -0600 Subject: [PATCH 071/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 315 ++++++++++++------------------------ 1 file 
changed, 100 insertions(+), 215 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 904dc5d6fd..7f96082665 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -4,249 +4,85 @@ #PBS -q prod #PBS -l select=48 #PBS -l filesystems=eagle:home -# -set +x -loadCondaEnv() { - if [[ "${CONDA_EXE}" ]]; then - echo "Already inside ${CONDA_EXE}, exiting!" - else - MODULE_STR="$1" - module load "conda/${MODULE_STR}" - conda activate base - fi -} - -setupVenv() { - VENV_DIR="$1" - if [[ -d "${VENV_DIR}" ]]; then - echo "Found venv at: ${VENV_DIR}" - source "${VENV_DIR}/bin/activate" - else - echo "Skipping setupVenv() on $(hostname)" - fi -} -setupPython() { - local conda_date=$1 - local venv_path=$2 - if [[ "${CONDA_EXE}" ]]; then - echo "Caught CONDA_EXE: ${CONDA_EXE}" +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" else - loadCondaEnv "${conda_date}" + echo "ERROR: UNABLE TO SOURCE ${fp}" fi - if [[ "${VIRTUAL_ENV}" ]]; then - echo "Caught VIRTUAL_ENV: ${VIRTUAL_ENV}" - else - setupVenv "${venv_path}" - fi -} - -ezpz() { - if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz - else - echo "Found ezpz!" - fi - if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then - echo "Has ezpz installed. Nothing to do." - else - echo "Does not have ezpz installed. Installing..." 
- echo "Using $(which python3) to install \`ezpz\`:" - python3 -m pip install -e ezpz > ezpz-install.log 2>&1 - fi - source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit - source ezpz/src/ezpz/bin/getjobenv || exit -} - -makeHostfiles() { - GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - export GPUS_PER_NODE="${GPUS_PER_NODE}" - # ---- Make MPICH hostfile ---------------- - export hostfile_mpich=hostfile_mpich - cat "$PBS_NODEFILE" > "${hostfile_mpich}" - # ---- Make DeepSpeed hostfile ------------------- - export hostfile_deepspeed=hostfile_deepspeed - cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" - sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" - { - echo "PATH=${PATH}" ; - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; - echo "http_proxy=${http_proxy}" ; - echo "https_pro]xy=${https_proxy}" ; - echo "CFLAGS=${CFLAGS}" ; - echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; - } > .deepspeed_env - # echo "PATH=${PATH}" > .deepspeed_env - # echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env - # echo "http_proxy=${http_proxy}" >> .deepspeed_env - # echo "https_proxy=${https_proxy}" >> .deepspeed_env - # echo "CFLAGS=${CFLAGS}" >> .d eepspeed_env - # echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env - # ------------------------------------------------- -} - -setupData() { - cidx=$1 - echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" 
- # export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" - # HERE=$(python3 -c 'import os; print(os.getcwd())') - # export DATA_FILE_LIST="/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" - # export DATA_FILE_LIST="/eagle/datasets/dolma/data_file_list_select.txt" - # export DATA_FILE_LIST="/eagle/datasets/dolma/chunks/4/data_file_list_chunk_${cidx}_of_4.txt" - # export DATA_FILE_LIST="./chunks/4/data_file_list_chunk_${cidx}_of_4.txt" - # export DATA_FILE_LIST="./chunks/4/data_file_list_chunk_${cidx}_of_4.txt" - # export DATA_FILE_LIST="./dolma_data_file_list-${cidx}-of-4.txt" - export DATA_FILE_LIST="${DATA_FILE_LIST:-"./dolma-shuf-chunk-${cidx}-of-4.txt"}" - echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST}" - # [ -f "$DATA_FILE_LIST" ] || exit - NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" - export NDOCS="${NDOCS}" - # if [[ -z "${DATA_CACHE_PATH}" ]]; then - # else - # echo "CAUGHT DATA_CACHE_PATH: ${DATA_CACHE_PATH} from env !!" 
- # DATA_CACHE_PATH="${DATA_CACHE_PATH}" - # fi - data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - export DOLMA_CHUNK_IDX="${cidx}" - export DATA_FILE_LIST_STEM="${data_file_list_stem}" - export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" - mkdir -p "${DATA_CACHE_PATH}" } - -# ==== SCRIPT START ======================================================== -# cd "${PBS_O_WORKDIR}" || exit +# +++++++++++++++ SCRIPT START +++++++++++++++++++++++ +# ---- source ./helpers_alcf.sh --------------------- HERE=$(python3 -c 'import os; print(os.getcwd())') -# if [[ -z "${CONDA_EXE}" ]]; then -module load conda/2023-10-04; conda activate base -# else -# echo "Caught CONDA_EXE = ${CONDA_EXE} from env" -# fi +sourceFile "${HERE}/helpers_alcf.sh" || exit -# if [[ -z "${VIRTUAL_ENV}" ]]; then -# source /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/venvs/polaris/2023-10-04/bin/activate || exit -# source ~/venvs/polaris/2023-10-04/bin/activate || exit -# else -# echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from env" -# fi -# if [[ "${VIRTUAL_ENV}" ]]; then -# echo "Caught virtual env from environment, using ${VIRTUAL_ENV}" -# else +# ---- load conda ----------------------------------- +module load conda/2023-10-04; conda activate base echo "Using $(which python3)" +# ---- fns from ./helpers_alcf.sh ------------------- ezpz makeHostfiles -setupData "${DOLMA_CHUNK_IDX:-0}" -# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" -# export NDOCS="${NDOCS}" -echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." 
+saveDSenv +# setupData "${DOLMA_CHUNK_IDX:-00}" +# export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" +# +# ---- DATA SETUP ------------------------------------ +DATA_FILE_LIST="./data_file_list_shuf_debug.txt" && export DATA_FILE_LIST="${DATA_FILE_LIST}" +NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" +WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" +DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" +dcp="${HERE}/.cache/${DFL_STEM}-index-cache" +DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" +mkdir -p "${DATA_CACHE_PATH}" +if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then + echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." +else + echo "Using NDOCS=${NDOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" +fi +echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" +# ---------------------------------------------------- -# ---- Parallelism Settings ---- +# ---- Parallelism Settings -------------------------- PP=${PP:-1} TP=${TP:-2} export PP="${PP}" export TP="${TP}" -# ------------------------------ - -HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" +export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} +# ---------------------------------------------------- -# ---- Llama2 7B Config ----------------------- +# ---- Llama2 7B Config ------------------------------ export HEADS=${HEADS:-32} export NLAYERS=${NLAYERS:-32} export HIDDEN=${HIDDEN:-4096} export NUM_KV_HEAD=${NUM_KV_HEAD:-8} -# --------------------------------------------- +# ---------------------------------------------------- -# ---- Run Settings ------------------------------------------ +# ---- Run Settings ---------------------------------- export LR=${LR:-0.00015} export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 export 
ZERO_STAGE=${ZERO_STAGE:-2} export MICRO_BATCH=${MICRO_BATCH:-8} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) -# --------------------------------------------- export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" - -if [[ "${DOLMA_CHUNK_IDX}" == 0 ]]; then - TRAIN_ITER=78739 -elif [[ "${DOLMA_CHUNK_IDX}" == 1 ]]; then - TRAIN_ITER=81008 -elif [[ "${DOLMA_CHUNK_IDX}" == 2 ]]; then - TRAIN_ITER=79591 -elif [[ "${DOLMA_CHUNK_IDX}" == 3 ]]; then - TRAIN_ITER=78552 -else - echo "caught DOLMA_CHUNK_IDX=${DOLMA_CHUNK_IDX}" - TRAIN_ITER="${TRAIN_ITER:-320000}" - echo "Setting TRAIN_ITER=${TRAIN_ITER}" - # echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" -fi - +export TRAIN_ITER=${TRAIN_ITER:-317892} +# export TRAIN_ITER="${TRAIN_ITER:-320000}" export EVAL_ITERS="${EVAL_ITERS:-10}" export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" -# export TRAIN_ITER=${TRAIN_ITER:-317892} - export SAVE_INTERVAL=${SAVE_INTERVAL:-200} export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} - +export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/40/data_file_list_chunk_0_of_40.txt" -# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/10/data_file_list_chunk_0_of_10.txt" -# - -# export DOLMA_CHUNK_00_of_10="./chunks/10/data_file_list_chunk_00_of_10.txt" # 762 documents (lines) -# export DOLMA_CHUNK_01_of_10="./chunks/10/data_file_list_chunk_01_of_10.txt" # 722 -# export 
DOLMA_CHUNK_02_of_10="./chunks/10/data_file_list_chunk_02_of_10.txt" # 727 -# export DOLMA_CHUNK_03_of_10="./chunks/10/data_file_list_chunk_03_of_10.txt" # 707 -# export DOLMA_CHUNK_04_of_10="./chunks/10/data_file_list_chunk_04_of_10.txt" # 744 -# export DOLMA_CHUNK_05_of_10="./chunks/10/data_file_list_chunk_05_of_10.txt" # 766 -# export DOLMA_CHUNK_06_of_10="./chunks/10/data_file_list_chunk_06_of_10.txt" # 730 -# export DOLMA_CHUNK_07_of_10="./chunks/10/data_file_list_chunk_07_of_10.txt" # 759 -# export DOLMA_CHUNK_08_of_10="./chunks/10/data_file_list_chunk_08_of_10.txt" # 777 -# export DOLMA_CHUNK_09_of_10="./chunks/10/data_file_list_chunk_09_of_10.txt" # 752 - -# -# export DOLMA_CHUNK_00_of_04="./dolma_data_file_list-00-of-04.txt" # 1860 documents (lines) -# export DOLMA_CHUNK_01_of_04="./dolma_data_file_list-01-of-04.txt" # 1860 documents (lines) -# export DOLMA_CHUNK_02_of_04="./dolma_data_file_list-02-of-04.txt" # 1860 documents (lines) -# export DOLMA_CHUNK_03_of_04="./dolma_data_file_list-03-of-04.txt" # 1860 documents (lines) -# export DOLMA_CHUNK_04_of_04="./dolma_data_file_list-04-of-04.txt" # 6 documents (lines) - - - -# if [[ -n "$DEBUG_RUN" ]]; then -# # echo "Using LAST DOLMA CHUNK {09 / 10} with ${NDOCS} documents..." -# export DATA_FILE_LIST=${DATA_FILE_LIST:-${DOLMA_CHUNK_09_of_10}} -# # export ndocs=$(wc -l < "${DATA_FILE_LIST}") -# else -# # export fname="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" -# # export fname="${DOLMA_CHUNK_!{DOLMA_CHUNK_IDX}_of_10}" -# # export DATA_FILE_LIST="${DATA_FILE_LIST:-${DOLMA_CHUNK_00_of_10}}" -# fi -# export ndocs -# export DATA_FILE_LIST="${DATA_FILE_LIST}" -# export DATA_FILE_LIST="./dolma_data_file_list-00-of-04.txt" -# export DATA_FILE_LIST="./dolma-chunk-00-of-40.txt" -# -# bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit - -# export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -# if [[ -z "$DATA_CACHE_PATH" ]]; then -# echo "Not using DATA_CACHE_PATH !!" 
-# else -# echo "Using DATA_CACHE_PATH: ${DATA_CACHE_PATH}" -# fi - - -# export EXTRA_ARGS="--use-flash-attn-v2 --num-key-value-heads ${NUM_KV_HEAD}" - -# export DATA_FILE_LIST="./chunks/10/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_10.txt" -# export DATA_FILE_LIST="/lus/eagle/projects/datasets/dolma/chunks/data_file_list_chunk_${DOLMA_CHUNK_IDX}_of_20.txt" -# export DATA_FILE_LIST="./dolma_data_file_list-${DOLMA_CHUNK_IDX}-of-04.txt" +# ---------------------------------------------------- echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- WORLD_SIZE:${WORLD_SIZE}" @@ -255,17 +91,65 @@ echo "- MODEL_TYPE: ${MODEL_TYPE}" echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -# bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ +# if [[ "${DOLMA_CHUNK_IDX}" == 0 ]]; then +# TRAIN_ITER=78739 +# elif [[ "${DOLMA_CHUNK_IDX}" == 1 ]]; then +# TRAIN_ITER=81008 +# elif [[ "${DOLMA_CHUNK_IDX}" == 2 ]]; then +# TRAIN_ITER=79591 +# elif [[ "${DOLMA_CHUNK_IDX}" == 3 ]]; then +# TRAIN_ITER=78552 +# else +# echo "caught DOLMA_CHUNK_IDX=${DOLMA_CHUNK_IDX}" +# TRAIN_ITER="${TRAIN_ITER:-320000}" +# echo "Setting TRAIN_ITER=${TRAIN_ITER}" +# # echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" +# fi +# +++++NOTES ++++++++++++++++++++++++++++++++++++++++++++++++++ +# XXX: +# - need to merge *.json files +# - Can we create indices on a per-dataset basis? +# (i.e. one for common-crawl, one for stack-code, etc.) +# - Aggregate `stack-code/**/{*.bin,*.idx}` +# +# - Given: {f1.bin,f2.bin,...,fn.bin} +# - tot_tokens = 0 +# - agg = [] +# - Start: +# - read: f1.bin +# - tot_tokens += sum(tokens(f1.bin)) +# - if tot_tokens < needed_tokens: +# - agg.append(f1.bin) +# - else: +# + +# TODO: +# - StackExchange ~ 500B total, using 80% ~ 400B tokens +# - figure out how to deal with MANY small files (e.g. 
stack-code) +# - Add logic for determining `train_iters` dynamically from `data-file-list` +# (which specifies a single _chunk_) +# - get script from Varuni +# - should: +# - take in a `data_file_list.txt` +# - return number of training iterations +# +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# ---- Build DeepSpeed Config --------------------------------- DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 +# ------------------------------------------------------------- +# ---- Specify output location -------------------------------- OUTPUT_PREFIX="${HERE}/logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" mkdir -p "${OUTPUT_DIR}" echo "!!!Please see logs at ${OUTPUT_DIR}" + +# ---- Setup DeepSpeed arguments -------------------------------- ds_args=" " ds_args=" --deepspeed ${ds_args}" if [[ $PP == 1 ]]; then @@ -280,9 +164,12 @@ if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then # --checkpoint-activations \ # --deepspeed-activation-checkpointing fi +# --------------------------------------------------------------- gpt_args=() +# we are now using activation checkpoint provided by megatron, see below. +# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" gpt_args+=( @@ -290,28 +177,26 @@ if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then "--checkpoint-num-layers 1" ) fi -# we are now using activation checkpoint provided by megatron, see below. 
-# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" -# NUM_KV_HEADS="${NUM_KV_HEADS:-0}" -# if [[ $NUM_KV_HEADS -]] # take custom args custom_args=" $@" # launcher setting +hfds="${HERE}/hostfile_deepspeed" +hfmpi="${HERE}/hostfile_mpich" +[ -f "$hfds" ] || exit +[ -f "$hfmpi" ] || exit + LAUNCHER=${LAUNCHER:-MPICH} if [[ $LAUNCHER == "deepspeed" ]]; then launcher="" else - launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" + launcher="--force_multi --hostfile $hfds --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" fi NCCL=${NCCL:-nccl} EXEC="pretrain_gpt_alcf.py" -# MODEL=LLAMA_7B -# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} - # --vocab-file $VOCAB_FILE \ # --merge-file $MERGE_FILE \ # --lr-decay-iters 320000 \ From 9a3688f05f5d5588fd5bb98adfe3fba1c04aa192 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 23 Feb 2024 15:57:46 -0600 Subject: [PATCH 072/268] Add `helpers_alcf.sh` --- helpers_alcf.sh | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 helpers_alcf.sh diff --git a/helpers_alcf.sh b/helpers_alcf.sh new file mode 100644 index 0000000000..b91000a67c --- /dev/null +++ b/helpers_alcf.sh @@ -0,0 +1,80 @@ +#!/bin/bash --login + +ezpz() { + if [[ ! -d ezpz ]]; then + git clone https://github.com/saforem2/ezpz + else + echo "Found ezpz!" + fi + if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + echo "Has ezpz installed. Nothing to do." + else + echo "Does not have ezpz installed. Installing..." 
+ echo "Using $(which python3) to install \`ezpz\`:" + python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + fi + source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit + source ezpz/src/ezpz/bin/getjobenv || exit +} + +saveDSenv() { + echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" + { + echo "PATH=${PATH}" ; + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; + echo "http_proxy=${http_proxy}" ; + echo "https_proxy=${https_proxy}" ; + echo "CFLAGS=${CFLAGS}" ; + echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; + } > .deepspeed_env +} + +makeHostfiles() { + GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') + export GPUS_PER_NODE="${GPUS_PER_NODE}" + # ---- Make MPICH hostfile ---------------- + export hostfile_mpich=hostfile_mpich + cat "$PBS_NODEFILE" > "${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + +sumWeights() { + local file_list=$1 + weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") + # weights=$(echo "$weights" | tr ",]" "]") + # echo "weights: $weights" + python3 -c "import numpy as np; print(np.sum(${weights}))" +} + +sumFiles() { + local rd=$1 + for f in $("${rd}/*.txt"); do + ws=$(sumWeights "${rd}/${f}") + echo "sum($f.weights)=${ws}" + done +} + +setupData() { + cidx=$1 + echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" + dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" + if [[ -z "${DATA_FILE_LIST}" ]]; then + DATA_FILE_LIST="${dfl}" + else + echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" 
+ fi + NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" + WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" + export WEIGHT_SUM="${WEIGHT_SUM}" + export NDOCS="${NDOCS}" + echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" + echo "WEIGHT SUM: ${WEIGHT_SUM}" + data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + export DOLMA_CHUNK_IDX="${cidx}" + export DATA_FILE_LIST_STEM="${data_file_list_stem}" + export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" + mkdir -p "${DATA_CACHE_PATH}" +} From 5cfa5569ec9672128b6227e66ee9a15146389dea Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Sat, 24 Feb 2024 16:59:39 -0600 Subject: [PATCH 073/268] removed unnecessary setup for master_port --- llama_alcf.sh | 7 ++++--- pretrain_gpt_alcf.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llama_alcf.sh b/llama_alcf.sh index dcfa41d109..0ca7cb78bb 100755 --- a/llama_alcf.sh +++ b/llama_alcf.sh @@ -2,7 +2,7 @@ #PBS -l walltime=0:30:00 #PBS -A datascience #PBS -q debug-scaling -#PBS -l select=6 +#PBS -l select=2 #PBS -l filesystems=eagle:grand:home export PPN=4 export MD=/home/hzheng/ALCF-Megatron-DeepSpeed @@ -20,7 +20,8 @@ export SP=$((PBS_JOBSIZE*PPN/PP/TP)) export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" -export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_select.txt" +#export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_select.txt" +DATA_FILE_LIST=$PWD/test.txt echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" HIDDEN_SIZE=4096 @@ -31,7 +32,7 @@ TRAIN_ITERS=10 ZERO_STAGE=2 MODEL=LLAMA_7B OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ 
+MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 0f50ee2faa..a625875ab7 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -434,6 +434,7 @@ def git_ds_info(): def main(): # if RANK == 0: # setup_wandb() + ''' the following is import socket from mpi4py import MPI rank = MPI.COMM_WORLD.rank @@ -446,9 +447,8 @@ def main(): master_addr = MPI.COMM_WORLD.bcast(master_addr, root=0) os.environ["MASTER_ADDR"] = master_addr os.environ["MASTER_PORT"] = str(2345) - args = get_args() - - if (args.profile): + ''' + if os.getenv('TORCH_PROFILER_ENABLED') == '1': from torch.profiler import profile, record_function, ProfilerActivity with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: model = pretrain( From 43629e6a5b2285da9cd706d60bf45401ff826051 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 17:18:22 -0600 Subject: [PATCH 074/268] Update `helpers_alcf.sh` for Aurora --- helpers_alcf.sh | 60 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/helpers_alcf.sh b/helpers_alcf.sh index b91000a67c..c0f3e467f9 100644 --- a/helpers_alcf.sh +++ b/helpers_alcf.sh @@ -29,17 +29,6 @@ saveDSenv() { } > .deepspeed_env } -makeHostfiles() { - GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - export GPUS_PER_NODE="${GPUS_PER_NODE}" - # ---- Make MPICH hostfile ---------------- - export hostfile_mpich=hostfile_mpich - cat "$PBS_NODEFILE" > "${hostfile_mpich}" - # ---- Make DeepSpeed hostfile ------------------- - export hostfile_deepspeed=hostfile_deepspeed - cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" - sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i 
"${hostfile_deepspeed}" -} sumWeights() { local file_list=$1 @@ -78,3 +67,52 @@ setupData() { export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" mkdir -p "${DATA_CACHE_PATH}" } + + + +setEnv() { + if [[ $(hostname) == x4* ]]; then + SETENV_FILE="${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" + if [[ "${SETENV_FILE}" ]]; then + # shellcheck source=/home/foremans/anl_24_release_q4/llm.devkit/setenv.sh + source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" + else + echo "Unable to source ${SETENV_FILE}, exiting!" + exit + fi + elif [[ $(hostname) == x3* ]]; then + # ---- load conda ----------------------------------- + module load conda/2023-10-04; conda activate base + if [[ "${VIRTUAL_ENV}" ]]; then + echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" + else + echo "Not using VIRTUAL_ENV" + # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit + fi + else + echo "Unknown hostname $(hostname)" + exit 1 + fi +} + +makeHostfiles() { + GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') + export GPUS_PER_NODE="${GPUS_PER_NODE}" + # ---- Make MPICH hostfile ---------------- + export hostfile_mpich=hostfile_mpich + cat "$PBS_NODEFILE" > "${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + + +makeDSenv() { + echo "PATH=${PATH}" > .deepspeed_env + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env + echo "http_proxy=${http_proxy}" >> .deepspeed_env + echo "https_proxy=${https_proxy}" >> .deepspeed_env + echo "CFLAGS=${CFLAGS}" >> .deepspeed_env + echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env +} From 5a3dae7b3136f4f0567be098535aa17892126c5a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 17:20:01 -0600 Subject: [PATCH 075/268] Fix `unable to init args` in `pretrain_gpt_alcf.py` 
--- pretrain_gpt_alcf.py | 63 ++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 7a378c3a4f..cf9251f632 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -143,13 +143,14 @@ def model_provider(pre_process=True, post_process=True): print_rank_0(f"Number of parameters in model: {num_params}") print_rank_0(80 * '-') see_memory_usage("After Building Model", force=True) - if wandb.run is not None: - wandb.run.watch( - model, - log='all', - log_graph=True, - ) - wandb.run.config.update({'num_params': num_params}) + # if wandb.run is not None: + # if torch.cuda.is_available() + # wandb.run.watch( + # model, + # log='all', + # log_graph=True, + # ) + # wandb.run.config.update({'num_params': num_params}) return model def get_batch(data_iterator): @@ -446,30 +447,30 @@ def main(): master_addr = MPI.COMM_WORLD.bcast(master_addr, root=0) os.environ["MASTER_ADDR"] = master_addr os.environ["MASTER_PORT"] = str(2345) - args = get_args() - - if (args.profile): - from torch.profiler import profile, record_function, ProfilerActivity - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) - - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") - else: - model = pretrain( - train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, - data_post_process=data_post_process - ) + # args = get_args() + # + # if (args.profile): + # from torch.profiler import profile, record_function, ProfilerActivity + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + # model = 
pretrain( + # train_valid_test_datasets_provider, + # model_provider, + # ModelType.encoder_or_decoder, + # forward_step, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # data_post_process=data_post_process + # ) + # + # prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + # else: + model = pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process + ) # # from megatron.training import get_model # if wandb.run is not None: # args = get_args() From d44769cb9822aa24fba094c904c83326cd89fda7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 19:54:05 -0600 Subject: [PATCH 076/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 96 ++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 7f96082665..96f6197637 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -16,13 +16,19 @@ function sourceFile() { fi } -# +++++++++++++++ SCRIPT START +++++++++++++++++++++++ +# +++++++++++++++ SCRIPT START ++++++++++++++++++++++ # ---- source ./helpers_alcf.sh --------------------- HERE=$(python3 -c 'import os; print(os.getcwd())') sourceFile "${HERE}/helpers_alcf.sh" || exit # ---- load conda ----------------------------------- module load conda/2023-10-04; conda activate base +if [[ "${VIRTUAL_ENV}" ]]; then + echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" 
+else + echo "Not using VIRTUAL_ENV" + # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit +fi echo "Using $(which python3)" # ---- fns from ./helpers_alcf.sh ------------------- @@ -33,17 +39,18 @@ saveDSenv # export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" # # ---- DATA SETUP ------------------------------------ -DATA_FILE_LIST="./data_file_list_shuf_debug.txt" && export DATA_FILE_LIST="${DATA_FILE_LIST}" -NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" +dfl_debug="./data_file_list_shuf_debug.txt" +DATA_FILE_LIST="${DATA_FILE_LIST:-${dfl_debug}}" && export DATA_FILE_LIST="${DATA_FILE_LIST}" +NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" dcp="${HERE}/.cache/${DFL_STEM}-index-cache" DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" mkdir -p "${DATA_CACHE_PATH}" if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then - echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NDOCS} documents..." + echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." 
else - echo "Using NDOCS=${NDOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" + echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" fi echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" # ---------------------------------------------------- @@ -62,6 +69,7 @@ export HEADS=${HEADS:-32} export NLAYERS=${NLAYERS:-32} export HIDDEN=${HIDDEN:-4096} export NUM_KV_HEAD=${NUM_KV_HEAD:-8} +export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # ---------------------------------------------------- # ---- Run Settings ---------------------------------- @@ -71,15 +79,16 @@ export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 export ZERO_STAGE=${ZERO_STAGE:-2} export MICRO_BATCH=${MICRO_BATCH:-8} export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" -export TRAIN_ITER=${TRAIN_ITER:-317892} -# export TRAIN_ITER="${TRAIN_ITER:-320000}" export EVAL_ITERS="${EVAL_ITERS:-10}" +# export TRAIN_ITER="${TRAIN_ITER:-320000}" +export TRAIN_ITER=${TRAIN_ITER:-317892} export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" export SAVE_INTERVAL=${SAVE_INTERVAL:-200} +export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) +export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # ---------------------------------------------------- @@ -137,14 +146,17 @@ echo "++++++++++++++++++++++++++++++++++++++++++++++++++" # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # ---- Build 
DeepSpeed Config --------------------------------- -DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" +export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 # ------------------------------------------------------------- # ---- Specify output location -------------------------------- -OUTPUT_PREFIX="${HERE}/logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" +export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} -OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" +OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" +export OUTPUT_DIR="${OUTPUT_DIR}" +export OUTPUT_LOG="${OUTPUT_DIR}/output.log" +echo "${OUTPUT_LOG}" >> "logs/latest" mkdir -p "${OUTPUT_DIR}" echo "!!!Please see logs at ${OUTPUT_DIR}" @@ -205,54 +217,60 @@ EXEC="pretrain_gpt_alcf.py" # --lr-decay-iters 10000 \ # --num-workers 4 \ # launch python3 ${EXEC} \ + # --data-impl mmap \ run_cmd=" deepspeed $launcher ${EXEC} \ --$DTYPE \ + --split 90,5,5 \ + --use-flash-attn-v2 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ --lr ${LR} \ --log-interval 1 \ --seq-length $SEQ \ - --num-layers $NLAYERS \ - --hidden-size $HIDDEN \ - --ffn-hidden-size 11008 \ - --train-iters $TRAIN_ITER \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters 
${TRAIN_ITER} \ --eval-iters ${EVAL_ITERS} \ - --distributed-backend $NCCL \ - --num-attention-heads $HEADS \ - --max-position-embeddings $SEQ \ - --micro-batch-size $MICRO_BATCH \ + --distributed-backend ${NCCL} \ + --num-attention-heads ${HEADS} \ --save-interval ${SAVE_INTERVAL} \ --eval-interval ${EVAL_INTERVAL} \ - --tensor-model-parallel-size $TP \ - --global-batch-size $GLOBAL_BATCH \ - --pipeline-model-parallel-size $PP \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ --data-file-list ${DATA_FILE_LIST} \ - --load checkpoints/${OUTPUT_PREFIX} \ - --save checkpoints/${OUTPUT_PREFIX} \ - --data-cache-path ${DATA_CACHE_PATH} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ --tokenizer-model ${TOKENIZER_MODEL} \ - --split 90,5,5 \ - --data-impl mmap \ - --no-masked-softmax-fusion \ - --no-bias-gelu-fusion \ - --no-bias-dropout-fusion \ - --no-gradient-accumulation-fusion \ - --use-flash-attn-v2 \ - --lr-decay-style cosine \ - --tokenizer-type Llama2Tokenizer \ - --use-checkpoint-opt_param-scheduler \ - --accumulate-allreduce-grads-in-fp32 \ $ds_args \ ${LLAMA_ARGS} \ ${gpt_args[*]} \ $custom_args \ - |& tee $OUTPUT_DIR/output.log + >> ${OUTPUT_LOG} 2>&1 & " + # |& tee $OUTPUT_DIR/output.log # ${EXTRA_ARGS} \ +echo "All DeepSpeed(s): $(which -a deepspeed)" echo "Using $(which deepspeed)" ds_report -echo ${run_cmd} -eval ${run_cmd} +echo "${run_cmd}" + +echo "[!! 
NOTE] View output at:" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +# echo "${OUTPUT_LOG}" +eval "${run_cmd}" set +x From d3a297278fb2bf55339f7879cd5a2784628f3b6b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 19:54:21 -0600 Subject: [PATCH 077/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 289 ++++++++++++++++++++++++------------------- 1 file changed, 160 insertions(+), 129 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index cf9251f632..fe45a67a00 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -70,10 +70,8 @@ def model_provider(pre_process=True, post_process=True): """Build the model.""" - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - + see_memory_usage("Before Building Model", force=True) args = get_args() config = core_transformer_config_from_args(args) if wandb.run is not None: @@ -91,27 +89,36 @@ def model_provider(pre_process=True, post_process=True): assert wandb is not None and wandb.run is not None print(f'Updating {wandb.run.name=} at {wandb.run.url=}') wandb.run.config.update({'args': vars(args)}) - with deepspeed.zero.Init(data_parallel_group=dpg, - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config_dict, - enabled=args.zero_stage == 3, - mpu=mpu): + with deepspeed.zero.Init( + data_parallel_group=dpg, + remote_device=( + None if args.remote_device == 'none' else args.remote_device + ), + config_dict_or_path=args.deepspeed_config_dict, + enabled=args.zero_stage == 3, + mpu=mpu + ): if args.deepspeed and not args.no_pipeline_parallel: model = GPTModelPipe( config=config, num_tokentypes=0, parallel_output=True ) - # This is a hack to give us a reference to get_batch_pipe from within training.py + # This is a hack to give us a reference to + # get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize 
model._megatron_batch_fn = get_batch_pipe - # Predompute the attention mask and store it in args. This avoids having to - # pipeline it as an activation during training. The mask is constant, and thus - # we can reuse it. - attention_mask = torch.tril(torch.ones( - (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view( - 1, 1, args.seq_length, args.seq_length) + # Predompute the attention mask and store it in args. + # This avoids having to pipeline it + # as an activation during training. + # The mask is constant, and thus we can reuse it. + attention_mask = torch.tril( + torch.ones( + (1, args.seq_length, args.seq_length), + device=get_accelerator().current_device_name() + ) + ).view(1, 1, args.seq_length, args.seq_length) # Convert attention mask to binary: attention_mask = (attention_mask < 0.5) @@ -123,7 +130,8 @@ def model_provider(pre_process=True, post_process=True): # Attention mask must be bool. args.attn_mask = attention_mask.to(torch.bool) - # For prertaining, since sequence length is fixed, cache rotary embedding in args, to avoid communicating around + # For prertaining, since sequence length is fixed, + # cache rotary embedding in args, to avoid communicating around if args.use_rotary_position_embeddings: update_rotary_pos_emb(args.seq_length) @@ -143,8 +151,8 @@ def model_provider(pre_process=True, post_process=True): print_rank_0(f"Number of parameters in model: {num_params}") print_rank_0(80 * '-') see_memory_usage("After Building Model", force=True) - # if wandb.run is not None: - # if torch.cuda.is_available() + if wandb.run is not None: + wandb.run.config.update({'num_params': num_params}) # wandb.run.watch( # model, # log='all', @@ -153,6 +161,7 @@ def model_provider(pre_process=True, post_process=True): # wandb.run.config.update({'num_params': num_params}) return model + def get_batch(data_iterator): """Generate a batch""" args = get_args() @@ -162,11 +171,12 @@ def get_batch(data_iterator): keys = 
['text'] datatype = torch.int64 - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None + data = next(data_iterator) if data_iterator is not None else None + # # Broadcast data. + # if data_iterator is not None: + # data = next(data_iterator) + # else: + # data = None data_b = tensor_parallel.broadcast_data(keys, data, datatype) # Unpack. @@ -207,6 +217,7 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids + def data_post_process(data, data_sampler_state_dict): args = get_args() if args.data_efficiency_curriculum_learning: @@ -232,8 +243,12 @@ def data_post_process(data, data_sampler_state_dict): args.data_efficiency_curriculum_learning_seqlen_type = None return data + def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + """ + Modification of `get_batch` to work on `next(data_iterator)` + instead of `data_iterator` + """ args = get_args() tokenizer = get_tokenizer() @@ -256,9 +271,13 @@ def get_batch_pipe(data): args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss) - if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < tokens.size()[1] + ): # seqlen-based curriculum learning - # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] + # tokens, position_ids, labels, loss_mask + # have size [batch size, seqlen] tokens = tokens[:, :args.curriculum_seqlen].contiguous() position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() if labels is not None: @@ -280,18 +299,39 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): # assert max(args.num_experts) >= 1 loss = loss + moe_loss + mos_loss if args.mos: - return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'mos loss': mos_loss} + return loss, { + 'total loss': loss, + 'lm 
loss': averaged_loss[0], + 'moe loss': moe_loss, + 'mos loss': mos_loss + } elif args.kd: - return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'kd loss': mos_loss} - print_rank_0('>>> total loss: {}, lm loss {}, kd loss {}'.format(loss, averaged_loss[0], mos_loss)) + return loss, { + 'total loss': loss, + 'lm loss': averaged_loss[0], + 'moe loss': moe_loss, + 'kd loss': mos_loss + } + print_rank_0( + f'>>> total loss: {loss}, ' + f'lm loss {averaged_loss[0]}, ' + f'kd loss {mos_loss}' + ) else: if max(args.num_experts) <= 1: return loss, {'lm loss': averaged_loss[0]} - else: - loss = loss + moe_loss - return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} + loss = loss + moe_loss + return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss} -def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask): + +def calculate_mos_loss( + args, + stu_output, + teacher_model, + tokens, + position_ids, + attention_mask +): mos_loss = 0 alpha = args.kd_alpha_ce beta = args.kd_beta_ce @@ -299,24 +339,48 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at if teacher_model: with torch.no_grad(): - if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + if ( + args.curriculum_learning_legacy and + args.curriculum_seqlen < args.seq_length + ): assert args.curriculum_seqlen is not None curriculum_seqlen = args.curriculum_seqlen tokens = tokens[:, :curriculum_seqlen].contiguous() position_ids = position_ids[:, :curriculum_seqlen].contiguous() - attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous() - # No need to truncate labels as we do not need it for the teacher logits - tea_output, tea_other_losses = teacher_model(tokens, position_ids, attention_mask) - assert stu_output.size() == tea_output.size(), 'teacher and student output should match in size. 
Student: {}, Teacher: {}, CL seq length {}'.format(stu_output.size(), tea_output.size(), args.curriculum_seqlen) + csl = curriculum_seqlen + attention_mask = ( + attention_mask[:, :, :csl, :csl].contiguous() + ) + # No need to truncate labels + # as we do not need it for the teacher logits + tea_output, tea_other_losses = teacher_model( + tokens, + position_ids, + attention_mask + ) + assert stu_output.size() == tea_output.size(), ( + 'teacher and student output should match in size. ' + f'Student: {stu_output.size()}, ' + f'Teacher: {tea_output.size()}, ' + f'CL seq length {args.curriculum_seqlen}' + ) student_logits = F.log_softmax(stu_output / kd_temp, dim=2) - tea_logits = F.softmax(tea_output / kd_temp, dim=2) # The target logits is expected to be probabilities. If we use log_softmax, then we need to set target_log to true when initializing the KLDivLoss. - - mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')(student_logits, tea_logits) + # The target logits is expected to be probabilities. + # If we use log_softmax, + # then we need to set target_log to true + # when initializing the KLDivLoss. 
+ tea_logits = F.softmax(tea_output / kd_temp, dim=2) + + mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')( + student_logits, + tea_logits + ) mos_loss = mos_loss.div(args.seq_length) * beta return mos_loss + def forward_step(data_iterator, model): """Forward step.""" args = get_args() @@ -330,21 +394,44 @@ def forward_step(data_iterator, model): if args.data_efficiency_curriculum_learning: args.curriculum_seqlen = tokens.size()[1] - if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \ - args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape': - args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + if ( + hasattr( + args, + 'data_efficiency_curriculum_learning_seqlen_type') + and ( + args.data_efficiency_curriculum_learning_seqlen_type + == 'seqlen_reshape' + ) + ): + args.data_efficiency_curriculum_learning_numel = ( + torch.numel(tokens) + ) if args.mos or args.kd: - # The forward func can return either the loss or the logits, depending on whether passing in the labels or not. + # The forward func can return either the loss or the logits, + # depending on whether passing in the labels or not. 
stu_output, other_losses = model(tokens, position_ids, attention_mask) - if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + if ( + args.curriculum_learning_legacy + and args.curriculum_seqlen < args.seq_length + ): assert args.curriculum_seqlen is not None labels = labels[:, :args.curriculum_seqlen].contiguous() - output_tensor = tensor_parallel.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels) + output_tensor = tensor_parallel.vocab_parallel_cross_entropy( + stu_output.contiguous().float(), + labels + ) else: - output_tensor, other_losses = model(tokens, position_ids, attention_mask, - labels=labels) - if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: + output_tensor, other_losses = model( + tokens, + position_ids, + attention_mask, + labels=labels + ) + if ( + args.curriculum_learning_legacy and + args.curriculum_seqlen < args.seq_length + ): loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() moe_losses = [] @@ -357,10 +444,17 @@ def forward_step(data_iterator, model): if args.mos or args.kd: assert model.training if args.teacher_forward and args.teacher_model is not None: - mos_loss = calculate_mos_loss(args, stu_output, - args.teacher_model[0], tokens, position_ids, attention_mask) + mos_loss = calculate_mos_loss( + args, + stu_output, + args.teacher_model[0], + tokens, + position_ids, + attention_mask + ) - # Output_tensor stores the standard loss, loos_func calculates the total loss. + # Output_tensor stores the standard loss, + # loss_func calculates the total loss. 
return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss) @@ -377,11 +471,11 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): w, fname = f.split() files.append(float(w)) files.append(fname) - elif len(args.data_path)==1 and os.path.isdir(args.data_path[0]): - path=args.data_path[0] + "/" + elif len(args.data_path) == 1 and os.path.isdir(args.data_path[0]): + path = args.data_path[0] + "/" for f in os.listdir(path): - if (os.path.isfile(path + f) and f.find(".bin")!=-1): - files.append(1) + if (os.path.isfile(path + f) and f.find(".bin") != -1): + files.append(1) files.append(path + f.split(".bin")[0]) else: files = args.data_path @@ -404,7 +498,11 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + result = subprocess.Popen( + f'type {cmd}', + stdout=subprocess.PIPE, + shell=True + ) return result.wait() == 0 @@ -429,24 +527,13 @@ def git_ds_info(): else: git_hash = "unknown" git_branch = "unknown" - print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + print( + f'**** Git info for Megatron: ' + f'git_hash={git_hash} git_branch={git_branch} ****' + ) def main(): - # if RANK == 0: - # setup_wandb() - import socket - from mpi4py import MPI - rank = MPI.COMM_WORLD.rank - - if rank == 0: - master_addr = socket.gethostname() - else: - master_addr = None - - master_addr = MPI.COMM_WORLD.bcast(master_addr, root=0) - os.environ["MASTER_ADDR"] = master_addr - os.environ["MASTER_PORT"] = str(2345) # args = get_args() # # if (args.profile): @@ -471,65 +558,9 @@ def main(): args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) - # # from megatron.training import get_model - # if wandb.run is not None: - # args = get_args() - # timers = get_timers() - # # model = get_model(model_provider, ModelType.encoder_or_decoder) - # elapsed_time = 
timers('interval-time').elapsed(barrier=True) - # total_iterations = os.environ.get( - # "TOTAL_ITERATIONS", - # (args.train_iters + args.eval_iters) - # ) - # seq_len = args.seq_length - # elapsed_time_per_iteration = elapsed_time / total_iterations - # if model is not None: - # samples_per_sec, tflops, approx_params_in_billions = throughput_calculator( - # model, - # args, - # elapsed_time, - # total_iterations, - # ) - # # Compute throughput. - # samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size - # tokens_per_sec = samples_per_sec * seq_len - # tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size - # sample_consumption_rate = args.consumed_train_samples / elapsed_time - # token_consumption_rate = args.consumed_train_tokens / elapsed_time - # # Tensorboard values. - # tdata = { - # # 'iteration': iteration, - # 'consumed_train_samples': args.consumed_train_samples, - # 'consumed_train_tokens': args.consumed_train_tokens, - # # 'learning_rate': learning_rate, - # # 'batch_size': batch_size, - # # 'loss_scale': loss_scale, - # # 'grad_norm': grad_norm, - # } - # # for key in loss_dict: - # # tdata[f'lm-loss/{key}'] = loss_dict[key] - # - # tdata = {f'train/{k}': v for k, v in tdata.items()} - # # if wbrun is not None and wbrun is wandb.run: - # if wandb.run is not None: - # wandb.run.log(tdata, commit=False) - # tput = { - # 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - # 'throughput/samples_per_sec': samples_per_sec, - # 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - # 'throughput/tokens_per_sec': tokens_per_sec, - # 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - # 'throughput/tflops': tflops, - # 'throughput/approx_params_in_billions': approx_params_in_billions, - # 'throughput/sample_consumption_rate': sample_consumption_rate, - # 'throughput/token_consumption_rate': token_consumption_rate, - # 'throughput/elapsed_ms_per_iteration': 
elapsed_time_per_iteration, - # } - # wandb.run.log(tput) return model - if __name__ == "__main__": # git_ds_info() # pretrain(train_valid_test_datasets_provider, From 3b6b94ab713e44a235831de0ab75955765f34f13 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 19:54:37 -0600 Subject: [PATCH 078/268] Update `megatron/training.py` --- megatron/training.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ef32cd3856..7e6c7dc6bb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1021,6 +1021,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, writer.add_scalar('optimizer/momentum_abs_max', opt_stats_2[2], iteration) writer.add_scalar('optimizer/weight_abs_max', opt_stats_2[3], iteration) + assert args is not None if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed(barrier=True) elapsed_time_per_iteration = elapsed_time / total_iterations @@ -1038,6 +1039,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size tokens_per_gpu_per_second = tokens_per_sec / args.world_size tokens_per_gpu_per_second_per_replica = tokens_per_gpu_per_second / args.data_parallel_size + wandb_metrics = {} if wandb is not None and getattr(wandb, 'run', None) is not None: assert wandb.run is not None wandb_metrics = { @@ -1055,17 +1057,16 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, } if loss_dict is not None: wandb_metrics |= { - f'loss/{k}': v for k, v in loss_dict.items() + 'loss/iteration': iteration, + **{f'loss/{k}': v for k, v in loss_dict.items()} } - wandb_metrics |= {'loss/iteration': iteration} - if writer: - if args.log_timers_to_tensorboard: - writer.add_scalar('iteration-time/iteration-time', - elapsed_time_per_iteration, iteration) - 
writer.add_scalar('iteration-time/iteration-time vs samples', - elapsed_time_per_iteration, args.consumed_train_samples) - writer.add_scalar('iteration-time/iteration-time vs tokens', - elapsed_time_per_iteration, args.consumed_train_tokens) + if writer and args.log_timers_to_tensorboard: + writer.add_scalar('iteration-time/iteration-time', + elapsed_time_per_iteration, iteration) + writer.add_scalar('iteration-time/iteration-time vs samples', + elapsed_time_per_iteration, args.consumed_train_samples) + writer.add_scalar('iteration-time/iteration-time vs tokens', + elapsed_time_per_iteration, args.consumed_train_tokens) log_string = ' iteration {:8d}/{:8d} |'.format( iteration, args.train_iters) log_string += ' consumed samples: {:12d} |'.format( @@ -1099,16 +1100,18 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, if avg > 0.0: log_string += ' {}: {:.6E} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb.log(wandb_metrics) if loss_scale is not None: log_string += ' loss scale: {:.1f} |'.format(loss_scale) + wandb_metrics |= {'loss/loss_scale': loss_scale} if grad_norm is not None: log_string += ' grad norm: {:.3f} |'.format(grad_norm) + wandb_metrics |= {'loss/grad_norm': grad_norm} if num_zeros_in_grad is not None: log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) + wandb_metrics |= {'loss/num_zeros_in_grad': num_zeros_in_grad} if params_norm is not None: log_string += ' params norm: {:.3f} |'.format(params_norm) + wandb_metrics |= {'loss/params_norm': params_norm} if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) if args.random_ltd: @@ -1129,7 +1132,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, # Report memory after optimizer state has been initialized. 
report_memory('(after {} iterations)'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= {'training/skiped_iterations': total_loss_dict[skipped_iters_key]} + wandb_metrics |= {'training/nan_iterations': total_loss_dict[nan_iters_key]} + wandb.log(wandb_metrics) + if timers is not None: + timers.log(timers_to_log, normalizer=args.log_interval) return report_memory_flag @@ -1138,6 +1146,7 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() # Extra barrier is added to make sure # all ranks report the max time. + # assert timers is not None timers('save-checkpoint', log_level=0).start(barrier=True) save_checkpoint(iteration, model, optimizer, opt_param_scheduler) timers('save-checkpoint').stop(barrier=True) From 4628b9f4c6a5157ab65331a7ec0a3f48c63e6cc7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 20:55:03 -0600 Subject: [PATCH 079/268] Add `train_llama_alcf_aurora.sh` --- train_llama_alcf_aurora.sh | 230 +++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 train_llama_alcf_aurora.sh diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh new file mode 100644 index 0000000000..04341ca843 --- /dev/null +++ b/train_llama_alcf_aurora.sh @@ -0,0 +1,230 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home +# + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +# +++++++++++++++ SCRIPT START ++++++++++++++++++++++ +# ---- source ./helpers_alcf.sh --------------------- +HERE=$(python3 -c 'import os; print(os.getcwd())') +sourceFile "${HERE}/helpers_alcf.sh" || exit + +# cd 
~/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed || exit +# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 +ezpz +setEnv +makeDSenv +makeHostfiles + +# ---- DATA SETUP ------------------------------------ +export DATA_FILE_LIST="./data_file_list_shuf_debug.txt" +NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" +WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" +DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" +dcp="${HERE}/.cache/${DFL_STEM}-index-cache" +DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" +mkdir -p "${DATA_CACHE_PATH}" +if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then + echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." +else + echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" +fi +echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" +# ---------------------------------------------------- + + +# ---- Parallelism Settings -------------------------- +PP=${PP:-1} +TP=${TP:-1} +export PP="${PP}" +export TP="${TP}" +export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" +export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} +# export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${PBS_NODEFILE}")} +# ---------------------------------------------------- + +# ---- Llama2 7B Config ----------------------- +export HEADS=${HEADS:-32} +export NLAYERS=${NLAYERS:-32} +export HIDDEN=${HIDDEN:-4096} +export NUM_KV_HEAD=${NUM_KV_HEAD:-8} +export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" +# --------------------------------------------- + +# ---- Run Settings --------------------------- +export LR=${LR:-0.0003} +export SEQ=${SEQ:-4096} +export DTYPE=${DTYPE:-bf16} +export ZERO_STAGE=${ZERO_STAGE:-2} +export MICRO_BATCH=${MICRO_BATCH:-4} +export 
GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} +export TRAIN_ITER=${TRAIN_ITER:-317892} +export SAVE_INTERVAL=${SAVE_INTERVAL:-200} +export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} +export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) +export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} +export TOKENIZER_MODEL="/lus/gecko/projects/Aurora_deployment/AuroraGPT/datasets/dolma/utils/tokenizer.model" +# export EXTRA_ARGS="" +export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" +# --------------------------------------------- + +# ---- Build DeepSpeed Config --------------------------------- +export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" +bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit +# ------------------------------------------------------------- + + +# ---- Specify output location -------------------------------- +export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" +# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} +OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" +export OUTPUT_DIR="${OUTPUT_DIR}" +export OUTPUT_LOG="${OUTPUT_DIR}/output.log" +export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" +echo "${OUTPUT_LOG}" >> "logs/latest" +mkdir -p "${OUTPUT_DIR}" +echo "!!!Please see logs at ${OUTPUT_DIR}" + + +gpt_args=() +ds_args=" " +ds_args=" --deepspeed ${ds_args}" +if [ "$PP" == 1 ]; then + ds_args=" --no-pipeline-parallel ${ds_args}" +fi +ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" +ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + +# BUG: [???] 
---------------------------------------------------------------- +# I dont know where this came from... +# > we are now using activation checkpoint provided by megatron, see below. +# --------------------------------------------------------------------------- +# +# NOTE: [???] --------------------------------------------------------------- +# In `train_llama_alcf_polaris.sh` we also pass +# `"--checkpoint-num-layers 1"` +# ---------------------------------------------------------------------------- +if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + gpt_args+=( + "--checkpoint-activations" + ) + # "--checkpoint-num-layers 1" + # --checkpoint-activations \ + # --deepspeed-activation-checkpointing +fi + +# take custom args +custom_args=" $@" + +# Ensure `./hostfile_deepspeed` and `./hostfile_mpich` exist in $(pwd) +hfds="${HERE}/hostfile_deepspeed" +hfmpi="${HERE}/hostfile_mpich" +[ -f "$hfds" ] || exit +[ -f "$hfmpi" ] || exit + +# launcher setting +LAUNCHER=${LAUNCHER:-MPICH} +if [[ $LAUNCHER == "deepspeed" ]]; then + launcher="" +else + launcher="--force_multi --hostfile ${hfds} --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" +fi + + +if [[ $(hostname) == x4* ]]; then + CCL=${CCL:-ccl} + BE="${CCL}" +elif [[ $(hostname) == x3* ]]; then + NCCL=${NCCL:-nccl} + BE="${NCCL}" +fi +# NCCL=${NCCL:-nccl} +EXEC=pretrain_gpt_alcf.py + +# MODEL=LLAMA_7B +# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +echo "- WORLD_SIZE:${WORLD_SIZE}" +echo "- BACKEND: ${BE}" +echo "- MODEL_TYPE: ${MODEL_TYPE}" +echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" +echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + +run_cmd=" + deepspeed $launcher ${EXEC} \ + --use-flash-attn \ + 
--num-key-value-heads ${NUM_KV_HEAD} \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --num-layers $NLAYERS \ + --hidden-size $HIDDEN \ + --num-attention-heads $HEADS \ + --seq-length $SEQ \ + --max-position-embeddings $SEQ \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --train-iters $TRAIN_ITER \ + --lr ${LR} \ + --lr-decay-style cosine \ + --log-interval 1 \ + --save-interval ${SAVE_INTERVAL} \ + --split 100,0,0 \ + --$DTYPE \ + --no-masked-softmax-fusion \ + --no-bias-gelu-fusion \ + --no-bias-dropout-fusion \ + --no-gradient-accumulation-fusion \ + --distributed-backend ${BE} \ + --tokenizer-type Llama2Tokenizer \ + --save checkpoints/${OUTPUT_PREFIX} \ + --load checkpoints/${OUTPUT_PREFIX} \ + --use-checkpoint-opt_param-scheduler \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --data-file-list ${DATA_FILE_LIST} \ + --data-cache-path ${DATA_CACHE_PATH} \ + $ds_args \ + ${LLAMA_ARGS} \ + ${gpt_args[*]} \ + $custom_args \ + >> ${OUTPUT_LOG} 2>&1 & + " + # |& tee $OUTPUT_DIR/output.log + +# --ffn-hidden-size 11008 \ +# --vocab-file $VOCAB_FILE \ +# --merge-file $MERGE_FILE \ +# --lr-decay-iters 320000 \ +# --num-workers 0 \ +# --eval-iters ${EVAL_ITERS} \ +# --eval-interval ${EVAL_INTERVAL} \ +# --lr-warmup-iters 5000 \ +# --lr-decay-iters 10000 \ +# --accumulate-allreduce-grads-in-fp32 \ +# --data-impl mmap \ + +echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +echo "[!! 
NOTE] View output at:" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" + +eval "${run_cmd}" +set +x From ef1e83ed18fb56e3ee93d159c574b264d9d35c94 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 24 Feb 2024 20:58:51 -0600 Subject: [PATCH 080/268] Renamed `llama_alcf.sh -> train_llama_alcf_polaris_hzheng.sh` --- llama_alcf.sh => train_llama_alcf_polaris_hzheng.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llama_alcf.sh => train_llama_alcf_polaris_hzheng.sh (100%) diff --git a/llama_alcf.sh b/train_llama_alcf_polaris_hzheng.sh similarity index 100% rename from llama_alcf.sh rename to train_llama_alcf_polaris_hzheng.sh From 34f72d7fca5abd01b26a4cbae2b4a119bceb49bf Mon Sep 17 00:00:00 2001 From: Varuni Date: Mon, 26 Feb 2024 16:41:53 +0000 Subject: [PATCH 081/268] add util files --- ALCF_utils/fused_stackcode.py | 36 ++++++++++++++++ ALCF_utils/fused_stackcode_bysize.py | 64 ++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 ALCF_utils/fused_stackcode.py create mode 100644 ALCF_utils/fused_stackcode_bysize.py diff --git a/ALCF_utils/fused_stackcode.py b/ALCF_utils/fused_stackcode.py new file mode 100644 index 0000000000..28c1e5c694 --- /dev/null +++ b/ALCF_utils/fused_stackcode.py @@ -0,0 +1,36 @@ +import os +from os import system +import glob +import json +import gzip +import pdb + +def list_json_gz_files(directory): + # Create the search pattern for JSON.gz files + search_pattern = os.path.join(directory, "**/*.json.gz") + + # Use glob to find all files matching the pattern + json_gz_files = glob.glob(search_pattern, recursive=True) + + return json_gz_files + +def combine_json_gz_files(json_gz_files, output_file): + in_list = "" + for i in json_gz_files: + in_list = in_list + " " +str(i) + command = "cat" + in_list + " > " + output_file + print(command) + system(command) + print("done") + +directory_path = "./data/stack-code/" +folder_count = 0 +for folder in os.listdir(directory_path): + print(f"working for folder 
{folder} {os.path.join(directory_path, folder)}") + folder_count = folder_count + 1 + json_gz_files = list_json_gz_files(os.path.join(directory_path, folder)) + out_path = os.path.join("./fused_stack", folder) + os.makedirs(out_path, exist_ok=True) + output_file = os.path.join(out_path, 'fused.json.gz') + combine_json_gz_files(json_gz_files, output_file) + diff --git a/ALCF_utils/fused_stackcode_bysize.py b/ALCF_utils/fused_stackcode_bysize.py new file mode 100644 index 0000000000..d838369e6b --- /dev/null +++ b/ALCF_utils/fused_stackcode_bysize.py @@ -0,0 +1,64 @@ +import os +from os import system +import glob +import json +import gzip +import pdb + +def list_json_gz_files(directory): + # Create the search pattern for JSON.gz files + search_pattern = os.path.join(directory, "**/*.json.gz") + + # Use glob to find all files matching the pattern + json_gz_files = glob.glob(search_pattern, recursive=True) + + return json_gz_files + +def combine_json_gz_files(json_gz_files, output_file): + in_list = "" + for i in json_gz_files: + in_list = in_list + " " +str(i) + command = "cat" + in_list + " > " + output_file + print(command) + system(command) + print("done ?") + +directory_path = "./fused_stack/" +out_path = "./fused_by_size" +folder_count = 0 +file_list = list_json_gz_files(directory_path) +size_dict = {} +for efile in file_list: + size_of_files = os.stat(efile) + size_dict[efile] = size_of_files.st_size / (1024*1024) # in MBs + +sorted_size_dict = dict(sorted(size_dict.items(), key=lambda item: item[1])) +vol = 0 +sublist = [] +super_list = {} +i=1 +for key, val in sorted_size_dict.items(): + if vol + val > 4608: + # add this item to list and reset vol, sublist + vol = 0 + sublist.append(key) + #print(sublist) + print("************") + super_list[i] = sublist + output_file = out_path + "/fused_stack_" + str(i) + ".json.gz" + print(output_file) + combine_json_gz_files(sublist, output_file) + sublist = [] + i=i+1 + else: + vol = vol + val + sublist.append(key) 
+#print(t) +#for folder in os.listdir(directory_path): +# print(f"working for folder {folder} {os.path.join(directory_path, folder)}") +# folder_count = folder_count + 1 +# json_gz_files = list_json_gz_files(os.path.join(directory_path, folder)) +# out_path = os.path.join("./fused_stack", folder) +# os.makedirs(out_path, exist_ok=True) +# output_file = os.path.join(out_path, 'fused.json.gz') +# combine_json_gz_files(json_gz_files, output_file) From 5a0fa3016b3c500b17b7eb74197798db4b21b834 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:44:03 -0600 Subject: [PATCH 082/268] Add `ALCF_utils/{test_blend.sh,test_blendable_dataset.py}` --- ALCF_utils/test_blend.sh | 73 ++++++++++++++++++++++++++++ ALCF_utils/test_blendable_dataset.py | 73 ++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100755 ALCF_utils/test_blend.sh create mode 100644 ALCF_utils/test_blendable_dataset.py diff --git a/ALCF_utils/test_blend.sh b/ALCF_utils/test_blend.sh new file mode 100755 index 0000000000..9073d2a58c --- /dev/null +++ b/ALCF_utils/test_blend.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/home/hzheng/ALCF-Megatron-DeepSpeed +module load conda/2023-10-04 +#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 +conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/chunks-merge/data_file_list_chunk_1_of_4.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B +OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +#MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n 
$((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE +python3 ./test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 80797 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF_utils/test_blendable_dataset.py b/ALCF_utils/test_blendable_dataset.py new file mode 100644 index 0000000000..1cf40c5aaf --- /dev/null +++ b/ALCF_utils/test_blendable_dataset.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +from megatron.data.gpt_dataset import 
build_train_valid_test_datasets +import numpy as np +from megatron.global_vars import set_args, set_global_variables, get_args +from megatron.arguments import parse_args +from megatron.initialize import initialize_megatron +from megatron.data.data_samplers import build_pretraining_data_loader + +initialize_megatron(allow_no_cuda=True) +args = get_args() + +data_file_list = args.data_file_list +print(f"Reading data from {args.data_file_list}") +files = [] +weights = [] +flist = [] +with open(data_file_list, 'r') as fin: + for f in fin.readlines(): + w, fname = f.split() + weights.append(float(w)) + flist.append(fname) + files.append(float(w)) + files.append(fname) +splits_string="100,0,0" + +weights = np.array(weights) +weights = weights/np.sum(weights) + +num_samples = args.global_batch_size*args.train_iters +num_datasets = len(weights) + +print(f"Number of datasets: {num_datasets}") +print(f"Global batch size: {args.global_batch_size}") +print(f"Training iterations: {args.train_iters}") +train_valid_test_num_samples = [num_samples, 0, 0] +seed=args.seed +data_impl = args.data_impl +skip_warmup = False +seq_length = args.seq_length +splits_string = "1,0,0" + +# Build datasets +train_ds, valid_ds, test_ds = build_train_valid_test_datasets(files, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, data_cache_path=args.data_cache_path) + +dataset_idx = [train_ds.dataset_index[i] for i in range(num_samples)] +ratio_select=np.zeros(num_datasets) +#for i in range(num_datasets): +# ratio_select[i] = np.sum([i==d for d in dataset_idx])/num_samples + +print(f"Total number of samples: {len(train_ds)}") +print(f"Weights set: {weights[:min(8, num_datasets)]}") +#print(f"Weights across training: {ratio_select[:min(8, num_datasets)]}") + +for e in range(min(100, args.train_iters)): + ratio_select=np.zeros(num_datasets) + for i in range(num_datasets): + ratio_select[i] = np.sum([i==d for d in 
dataset_idx[e*args.global_batch_size:(e+1)*args.global_batch_size]])/args.global_batch_size + print(f"iter-{e}: {ratio_select[:min(8, num_datasets)]}") + + +print("First 10 samples") +for i in range(10): + print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") + + +#### Build data loaders +train_dataloader = build_pretraining_data_loader( + train_ds, args.consumed_train_samples) +valid_dataloader = build_pretraining_data_loader( + valid_ds, args.consumed_valid_samples) +test_dataloader = build_pretraining_data_loader(test_ds, 0) From a4a08c9a9897ac6e883d97e3e0037268369d4cd3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:52:38 -0600 Subject: [PATCH 083/268] Update `train_llama_alcf_aurora.sh` --- train_llama_alcf_aurora.sh | 48 ++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh index 04341ca843..5bc9098b61 100644 --- a/train_llama_alcf_aurora.sh +++ b/train_llama_alcf_aurora.sh @@ -8,8 +8,8 @@ function sourceFile() { fp="$1" - echo "source-ing ${fp}" if [[ -f "${fp}" ]]; then + echo "Found ${fp}, \`source\`-ing" # shellcheck source="${fp}" source "${fp}" else @@ -21,29 +21,29 @@ function sourceFile() { # ---- source ./helpers_alcf.sh --------------------- HERE=$(python3 -c 'import os; print(os.getcwd())') sourceFile "${HERE}/helpers_alcf.sh" || exit - # cd ~/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed || exit # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 -ezpz -setEnv -makeDSenv -makeHostfiles - -# ---- DATA SETUP ------------------------------------ -export DATA_FILE_LIST="./data_file_list_shuf_debug.txt" -NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" -WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" -DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print 
$NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" -dcp="${HERE}/.cache/${DFL_STEM}-index-cache" -DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -mkdir -p "${DATA_CACHE_PATH}" -if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then - echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." -else - echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -fi -echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" -# ---------------------------------------------------- +ezpz || exit +setEnv || exit +saveDSenv || exit +makeHostfiles || exit +setupData "${DATA_FILE_LIST:-${HERE}/data_file_list_shuf_debug.txt}" || exit +# dfl_fallback="${HERE}/data_file_list_shuf_debug.txt" + +# # ---- DATA SETUP ------------------------------------ +# dfl_debug="./data_file_list_shuf_debug.txt" +# DATA_FILE_LIST="${DATA_FILE_LIST:-${dfl_debug}}" && export DATA_FILE_LIST="${DATA_FILE_LIST}" +# NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" +# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" +# DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" +# dcp="${HERE}/.cache/${DFL_STEM}-index-cache" +# DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" +# mkdir -p "${DATA_CACHE_PATH}" +# if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then +# echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." 
+# else +# echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" +# fi # ---- Parallelism Settings -------------------------- @@ -162,7 +162,9 @@ echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- WORLD_SIZE:${WORLD_SIZE}" echo "- BACKEND: ${BE}" echo "- MODEL_TYPE: ${MODEL_TYPE}" +echo "- DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" +echo "- Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++" run_cmd=" @@ -223,7 +225,7 @@ ds_report echo "${run_cmd}" -echo "[!! NOTE] View output at:" +printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" eval "${run_cmd}" From 0daba44f8961ae3650b8c0d3283dbb50cb9ba52e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:53:09 -0600 Subject: [PATCH 084/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 96f6197637..e627c6f8bf 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -84,7 +84,6 @@ export EVAL_ITERS="${EVAL_ITERS:-10}" export TRAIN_ITER=${TRAIN_ITER:-317892} export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" export SAVE_INTERVAL=${SAVE_INTERVAL:-200} -export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) @@ -156,11 +155,11 @@ export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_B OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" +export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" echo 
"${OUTPUT_LOG}" >> "logs/latest" mkdir -p "${OUTPUT_DIR}" echo "!!!Please see logs at ${OUTPUT_DIR}" - # ---- Setup DeepSpeed arguments -------------------------------- ds_args=" " ds_args=" --deepspeed ${ds_args}" From 62ef3c5c39aa1cd05cd085497d845b2d1b083c84 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:53:29 -0600 Subject: [PATCH 085/268] Remove old `set_params.sh` --- set_params.sh | 142 -------------------------------------------------- 1 file changed, 142 deletions(-) delete mode 100644 set_params.sh diff --git a/set_params.sh b/set_params.sh deleted file mode 100644 index 8682a86907..0000000000 --- a/set_params.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/bin/bash login -# echo "!!!please use generate_hostfile.sh to set hostfile for 18 nodes before training" -export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${PBS_NODEFILE}")} -export MICRO_BATCH=${MICRO_BATCH:-1} -export NLAYERS=${NLAYERS:-96} -export HIDDEN=${HIDDEN:-12288} -export HEADS=${HEADS:-96} -export LR=${LR:-0.0003} -export SEQ=${SEQ:-4096} -export TRAIN_ITER=${TRAIN_ITER:-300000} -export EVAL_ITERS=${EVAL_ITERS:-50} -export SAVE_INTERVAL=${SAVE_INTERVAL:-1000} -export EVAL_INTERVAL=${EVAL_INTERVAL:-50000} -export ZERO_STAGE=${ZERO_STAGE:-2} -export DTYPE=${DTYPE:-fp16} -export TP=${TP:-2} -export PP=${PP:-1} -export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} - -# echo "USING DATA_FILE_LIST: ${DATA_FILE_LIST}" || exit - - -# bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ - -# Disabling tensor/pipeline parallelism -TP=${TP:-1} -PP=${PP:-1} - -DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" -bash ./generate_config.sh "${DS_CONFIG}" || exit 1 - -OUTPUT_PREFIX="logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" 
-# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} -OUTPUT_DIR="${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" -mkdir -p "${OUTPUT_DIR}" -echo "!!!Please see logs at ${OUTPUT_DIR}" - -# Hostfile path -hostfile_deepspeed=./hostfile_deepspeed -hostfile_mpich=./hostfile_mpich -cat "$PBS_NODEFILE" > hostfile_mpich -cat "$PBS_NODEFILE" > hostfile_deepspeed ; sed -e 's/$/ slots=4/' -i hostfile_deepspeed - -ds_args=" " -ds_args=" --deepspeed ${ds_args}" -if [ "$PP" == 1 ]; then - ds_args=" --no-pipeline-parallel ${ds_args}" -fi -ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" -ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" - ds_args=" --deepspeed-activation-checkpointing ${ds_args}" - # --checkpoint-activations \ - # --deepspeed-activation-checkpointing -fi - -gpt_args=() - -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" - gpt_args+=( - "--checkpoint-activations" - "--checkpoint-num-layers 1" - ) -fi -# we are now using activation checkpoint provided by megatron, see below. 
-# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" -# NUM_KV_HEADS="${NUM_KV_HEADS:-0}" -# if [[ $NUM_KV_HEADS -]] - -# take custom args -custom_args=" $@" - -# launcher setting -LAUNCHER=${LAUNCHER:-MPICH} -if [[ $LAUNCHER == "deepspeed" ]]; then - launcher="" -else - launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" -fi - -NCCL=${NCCL:-nccl} - -# MODEL=LLAMA_7B -# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} - -# --vocab-file $VOCAB_FILE \ -# --merge-file $MERGE_FILE \ -# --lr-decay-iters 320000 \ -run_cmd=" - deepspeed $launcher pretrain_gpt_alcf.py \ - --tensor-model-parallel-size $TP \ - --pipeline-model-parallel-size $PP \ - --num-layers $NLAYERS \ - --hidden-size $HIDDEN \ - --num-attention-heads $HEADS \ - --seq-length $SEQ \ - --max-position-embeddings $SEQ \ - --micro-batch-size $MICRO_BATCH \ - --global-batch-size $GLOBAL_BATCH \ - --train-iters $TRAIN_ITER \ - --lr ${LR} \ - --lr-warmup-iters 5000 \ - --lr-decay-iters 10000 \ - --ffn-hidden-size 11008 \ - --lr-decay-style cosine \ - --data-impl mmap \ - --log-interval 1 \ - --eval-iters ${EVAL_ITERS} \ - --eval-interval ${EVAL_INTERVAL} \ - --save-interval ${SAVE_INTERVAL} \ - --split 90,5,5 \ - --$DTYPE \ - $ds_args \ - --no-masked-softmax-fusion \ - --no-bias-gelu-fusion \ - --no-bias-dropout-fusion \ - --no-gradient-accumulation-fusion \ - --distributed-backend $NCCL \ - --num-workers 0 \ - --tokenizer-type Llama2Tokenizer \ - --save checkpoints/${OUTPUT_PREFIX} \ - --load checkpoints/${OUTPUT_PREFIX} \ - --use-checkpoint-opt_param-scheduler \ - --accumulate-allreduce-grads-in-fp32 \ - --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ - --data-file-list ${DATA_FILE_LIST} \ - ${gpt_args[*]} \ - $custom_args \ - |& tee $OUTPUT_DIR/output.log - " - -echo "Using $(which deepspeed)" -ds_report - -echo ${run_cmd} -eval ${run_cmd} -set 
+x From 6676252a684a727591509b3dc4fb521a0c6de676 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:54:36 -0600 Subject: [PATCH 086/268] Update `helpers_alcf.sh` --- helpers_alcf.sh | 72 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/helpers_alcf.sh b/helpers_alcf.sh index c0f3e467f9..e3df380828 100644 --- a/helpers_alcf.sh +++ b/helpers_alcf.sh @@ -29,6 +29,19 @@ saveDSenv() { } > .deepspeed_env } +# makeDSenv() { +# saveDSenv +# } + + +# makeDSenv() { +# echo "PATH=${PATH}" > .deepspeed_env +# echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env +# echo "http_proxy=${http_proxy}" >> .deepspeed_env +# echo "https_proxy=${https_proxy}" >> .deepspeed_env +# echo "CFLAGS=${CFLAGS}" >> .deepspeed_env +# echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env +# } sumWeights() { local file_list=$1 @@ -108,11 +121,56 @@ makeHostfiles() { } -makeDSenv() { - echo "PATH=${PATH}" > .deepspeed_env - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env - echo "http_proxy=${http_proxy}" >> .deepspeed_env - echo "https_proxy=${https_proxy}" >> .deepspeed_env - echo "CFLAGS=${CFLAGS}" >> .deepspeed_env - echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env +setupData() { # dfl: abbrv. 
for DATA_FILE_LIST + dfl=$1 + printf "Calling: \`setupData()\` with %s\n" "${dfl}" + ndocs=$(wc -l < "${dfl}") + ws=$(sumWeights "${dfl}") + dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + dcp="${HERE}/.cache/${dfl_stem}/index-cache" + mkdir -p dcp + export DATA_FILE_LIST="${dfl}" + export NUM_DOCS="${ndocs}" + export WEIGHT_SUM="${ws}" + export DFL_STEM="${dfl_stem}" + export DATA_CACHE_PATH="${dcp}" + echo "--------------------" + echo "Updated environment:" + printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" + printf "NUM_DOCS: %s\n " "${NUM_DOCS}" + printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}" + printf "DFL_STEM: %s\n" "${DFL_STEM}" + printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" + echo "--------------------" +} + +printBlack() { + printf "\e[1;30m%s\e[0m\n" "$@" +} + +printRed() { + printf "\e[1;31m%s\e[0m\n" "$@" +} + +printGreen() { + printf "\e[1;32m%s\e[0m\n" "$@" +} + +printYellow() { + printf "\e[1;33m%s\e[0m\n" "$@" +} + +printBlue() { + printf "\e[1;34m%s\e[0m\n" "$@" +} + +printMagenta() { + printf "\e[1;35m%s\e[0m\n" "$@" +} + +printCyan() { + printf "\e[1;36m%s\e[0m\n" "$@" +} +printWhite() { + printf "\e[1;37m%s\e[0m\n" "$@" } From 747b568bc79543e85817fcd4238509b2a6d05868 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 09:55:47 -0600 Subject: [PATCH 087/268] Update `.gitignore` --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 610af67ac3..3e46cef4c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.deepspeed_env +*.bak +.cache/* outputs/ venvs/ wandb/ From 25358b7962c88a3923c10848f413643dc1e6ef44 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 28 Feb 2024 11:35:58 -0600 Subject: [PATCH 088/268] fixed int8 issue --- megatron/data/blendable_dataset.py | 3 +-- megatron/data/helpers.cpp | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index 
2516e58415..f3276c6823 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -34,8 +34,7 @@ def __init__(self, datasets, weights, size, *, # Build indicies. def _build_indices(): start_time = time.time() - assert num_datasets < 255 - dataset_index = np.zeros(self.size, dtype=np.uint8) + dataset_index = np.zeros(self.size, dtype=np.int64) dataset_sample_index = np.zeros(self.size, dtype=np.int64) from megatron.data import helpers diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 5c3a054875..142f159dd3 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -17,10 +17,10 @@ using namespace std; const int32_t LONG_SENTENCE_LEN = 512; -void build_blending_indices(py::array_t& dataset_index, +void build_blending_indices(py::array_t& dataset_index, py::array_t& dataset_sample_index, const py::array_t& weights, - const int32_t num_datasets, + const int64_t num_datasets, const int64_t size, const bool verbose) { /* Given multiple datasets and a weighting array, build samples such that it follows those wieghts.*/ @@ -58,7 +58,7 @@ void build_blending_indices(py::array_t& dataset_index, } // Populate the indices. - dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_index_ptr[sample_idx] = static_cast(max_error_index); dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; // Update the total samples. 
From 45a3428ee6d92a8355da0d7d1f227a160bb76553 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 12:35:57 -0600 Subject: [PATCH 089/268] Move `helpers_alcf.sh -> ALCF_utils/helpers_alcf.sh` --- helpers_alcf.sh => ALCF_utils/helpers_alcf.sh | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) rename helpers_alcf.sh => ALCF_utils/helpers_alcf.sh (82%) diff --git a/helpers_alcf.sh b/ALCF_utils/helpers_alcf.sh similarity index 82% rename from helpers_alcf.sh rename to ALCF_utils/helpers_alcf.sh index e3df380828..b70ac43407 100644 --- a/helpers_alcf.sh +++ b/ALCF_utils/helpers_alcf.sh @@ -59,28 +59,28 @@ sumFiles() { done } -setupData() { - cidx=$1 - echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" - dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" - if [[ -z "${DATA_FILE_LIST}" ]]; then - DATA_FILE_LIST="${dfl}" - else - echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" - fi - NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" - WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" - export WEIGHT_SUM="${WEIGHT_SUM}" - export NDOCS="${NDOCS}" - echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" - echo "WEIGHT SUM: ${WEIGHT_SUM}" - data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - export DOLMA_CHUNK_IDX="${cidx}" - export DATA_FILE_LIST_STEM="${data_file_list_stem}" - export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" - mkdir -p "${DATA_CACHE_PATH}" -} - +# setupData() { +# cidx=$1 +# echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" +# dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" +# if [[ -z "${DATA_FILE_LIST}" ]]; then +# DATA_FILE_LIST="${dfl}" +# else +# echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" 
+# fi +# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" +# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" +# export WEIGHT_SUM="${WEIGHT_SUM}" +# export NDOCS="${NDOCS}" +# echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" +# echo "WEIGHT SUM: ${WEIGHT_SUM}" +# data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") +# export DOLMA_CHUNK_IDX="${cidx}" +# export DATA_FILE_LIST_STEM="${data_file_list_stem}" +# export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" +# mkdir -p "${DATA_CACHE_PATH}" +# } +# setEnv() { From a907b47813900c44aef15c13cc3c9ef3026a0544 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 28 Feb 2024 13:57:46 -0600 Subject: [PATCH 090/268] modifying testing dataset --- ALCF_utils/test_blend_full.sh | 73 ++++++++++++++++++++++++++++ ALCF_utils/test_blendable_dataset.py | 2 +- 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100755 ALCF_utils/test_blend_full.sh diff --git a/ALCF_utils/test_blend_full.sh b/ALCF_utils/test_blend_full.sh new file mode 100755 index 0000000000..719ebb4151 --- /dev/null +++ b/ALCF_utils/test_blend_full.sh @@ -0,0 +1,73 @@ +#!/bin/bash +#PBS -l walltime=0:30:00 +#PBS -A datascience +#PBS -q debug +#PBS -l select=1 +#PBS -l filesystems=eagle:grand:home +cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/home/hzheng/ALCF-Megatron-DeepSpeed +module load conda/2023-10-04 +#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 +conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export TP=1 +export PP=1 +export SP=128 +export MBS=1 +export BS=$((MBS*SP)) +export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" + +HIDDEN_SIZE=4096 +NUM_LAYERS=32 +SEQ_LENGTH=2048 +EMBEDDINGS=2048 +TRAIN_ITERS=10 +ZERO_STAGE=2 +MODEL=LLAMA_7B 
+OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} +#MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE +python3 ./test_blendable_dataset.py \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size 5504 \ + --num-attention-heads 32 \ + --micro-batch-size ${MBS} \ + --global-batch-size ${BS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${EMBEDDINGS} \ + --train-iters 80797 \ + --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ + --tokenizer-type Llama2Tokenizer \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 3e-4 \ + --lr-decay-style cosine \ + --min-lr 3e-5 \ + --weight-decay 0.1 \ + --clip-grad 1 \ + --lr-warmup-iters 2 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --log-interval 1 \ + --cpu-optimizer \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 --fp16 \ + --no-query-key-layer-scaling \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --use-rotary-position-embeddings \ + --tokenizer-model /eagle/datasets/dolma/utils/tokenizer.model \ + --untie-embeddings-and-output-weights \ + --swiglu --normalization layernorm --disable-bias-linear --num-key-value-heads 4 \ + --tensorboard-dir ./outputs/${OUTPUT_PREFIX}/tensorboard --log-timers-to-tensorboard --tensorboard-log-interval 1 \ + --data-file-list ${DATA_FILE_LIST} \ + --data-path ${DATA_PATH} \ + --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ + --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF_utils/test_blendable_dataset.py b/ALCF_utils/test_blendable_dataset.py index 1cf40c5aaf..a0cccbb6cb 100644 --- 
a/ALCF_utils/test_blendable_dataset.py +++ b/ALCF_utils/test_blendable_dataset.py @@ -35,7 +35,7 @@ train_valid_test_num_samples = [num_samples, 0, 0] seed=args.seed data_impl = args.data_impl -skip_warmup = False +skip_warmup = not args.mmap_warmup seq_length = args.seq_length splits_string = "1,0,0" From 9435b633c39559e1d96d0b651747e0a7673b1642 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 14:21:37 -0600 Subject: [PATCH 091/268] Update `train_llama_alcf_aurora.sh` --- train_llama_alcf_aurora.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh index 5bc9098b61..dca3e17243 100644 --- a/train_llama_alcf_aurora.sh +++ b/train_llama_alcf_aurora.sh @@ -19,15 +19,16 @@ function sourceFile() { # +++++++++++++++ SCRIPT START ++++++++++++++++++++++ # ---- source ./helpers_alcf.sh --------------------- +cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') -sourceFile "${HERE}/helpers_alcf.sh" || exit +sourceFile "${HERE}/ALCF_utils/helpers_alcf.sh" || exit # cd ~/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed || exit # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 ezpz || exit setEnv || exit saveDSenv || exit makeHostfiles || exit -setupData "${DATA_FILE_LIST:-${HERE}/data_file_list_shuf_debug.txt}" || exit +setupData "${DATA_FILE_LIST:-${HERE}/data_file_list_reweighted.txt}" || exit # dfl_fallback="${HERE}/data_file_list_shuf_debug.txt" # # ---- DATA SETUP ------------------------------------ From fdf19045cfd3430d72e4b33e16a9eb45af410e97 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 28 Feb 2024 15:03:33 -0600 Subject: [PATCH 092/268] Set `skip_warmup=True` in `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 936ea5ec4b..4fefef795f 100644 --- a/pretrain_gpt_alcf.py +++ 
b/pretrain_gpt_alcf.py @@ -487,7 +487,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): train_valid_test_num_samples=train_val_test_num_samples, seq_length=args.seq_length, seed=args.seed, - skip_warmup=(not args.mmap_warmup), + skip_warmup=True, + # skip_warmup=(not args.mmap_warmup), train_data_prefix=args.train_data_path, valid_data_prefix=args.valid_data_path, test_data_prefix=args.test_data_path, From 802a6e896a0fbf91bc0503696c901b36b9f2b5d5 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 28 Feb 2024 17:05:26 -0600 Subject: [PATCH 093/268] test_blend_full --- ALCF_utils/test_blend_full.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ALCF_utils/test_blend_full.sh b/ALCF_utils/test_blend_full.sh index 719ebb4151..4245304456 100755 --- a/ALCF_utils/test_blend_full.sh +++ b/ALCF_utils/test_blend_full.sh @@ -22,12 +22,11 @@ HIDDEN_SIZE=4096 NUM_LAYERS=32 SEQ_LENGTH=2048 EMBEDDINGS=2048 -TRAIN_ITERS=10 +TRAIN_ITERS=80797 ZERO_STAGE=2 MODEL=LLAMA_7B OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -#MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE -python3 ./test_blendable_dataset.py \ +python3 ALCF_utils/test_blendable_dataset.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -38,7 +37,7 @@ python3 ./test_blendable_dataset.py \ --global-batch-size ${BS} \ --seq-length ${SEQ_LENGTH} \ --max-position-embeddings ${EMBEDDINGS} \ - --train-iters 80797 \ + --train-iters ${TRAIN_ITERS} \ --save ${MD}/checkpoints/${OUTPUT_PREFIX} \ --load ${MD}/checkpoints/${OUTPUT_PREFIX} \ --tokenizer-type Llama2Tokenizer \ @@ -52,6 +51,7 @@ python3 ./test_blendable_dataset.py \ --lr-warmup-iters 2 \ --optimizer adam \ --adam-beta1 0.9 \ + --mmap_warmup False \ --adam-beta2 0.95 \ --log-interval 1 \ --cpu-optimizer \ From 
6b33b81649c1636be0bb3c62ba8f1ed5bcb54198 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Mar 2024 09:28:37 -0600 Subject: [PATCH 094/268] changed multiprocessing context --- megatron/data/data_samplers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2d7da67e15..623e19e70e 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -44,7 +44,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=args.num_workers, - pin_memory=True) + pin_memory=True, multiprocessing_context='spawn') if args.repeated_dataloader: loader=RepeatingLoader(loader) return loader @@ -187,4 +187,4 @@ def __iter__(self): if len(batch) == self.micro_batch_size: self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch - batch = [] \ No newline at end of file + batch = [] From ea9f6e3cb2edf2fd06cd7eda43cae8a8b39faa6b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 4 Mar 2024 16:13:49 -0600 Subject: [PATCH 095/268] Update Aurora qsub scripts --- train_llama_alcf_aurora.sh | 3 ++- train_llama_alcf_aurora_qsub.sh | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100755 train_llama_alcf_aurora_qsub.sh diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh index dca3e17243..48651dbeb1 100644 --- a/train_llama_alcf_aurora.sh +++ b/train_llama_alcf_aurora.sh @@ -204,8 +204,9 @@ run_cmd=" ${LLAMA_ARGS} \ ${gpt_args[*]} \ $custom_args \ - >> ${OUTPUT_LOG} 2>&1 & + |& tee ${OUTPUT_LOG} " + # >> ${OUTPUT_LOG} 2>&1 & # |& tee $OUTPUT_DIR/output.log # --ffn-hidden-size 11008 \ diff --git a/train_llama_alcf_aurora_qsub.sh b/train_llama_alcf_aurora_qsub.sh new file mode 100755 index 0000000000..6f247da9c8 --- /dev/null +++ b/train_llama_alcf_aurora_qsub.sh @@ -0,0 +1,7 @@ +#!/bin/bash --login + + +cd "${PBS_O_WORKDIR}" || 
exit +eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 +source /home/foremans/anl_24_release_q4/llm.devkit/setenv.sh +bash ./train_llama_alcf_aurora.sh From 67914f3dce062615dca18e89b7f42edb133a42c1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 5 Mar 2024 16:42:07 -0600 Subject: [PATCH 096/268] Update `megatron/data/data_samplers.py` --- megatron/data/data_samplers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 623e19e70e..8eb2f2a668 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -41,10 +41,13 @@ def build_pretraining_data_loader(dataset, consumed_samples): args.dataloader_type)) # Torch dataloader. - loader = torch.utils.data.DataLoader(dataset, - batch_sampler=batch_sampler, - num_workers=args.num_workers, - pin_memory=True, multiprocessing_context='spawn') + loader = torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True, + # multiprocessing_context='spawn' + ) if args.repeated_dataloader: loader=RepeatingLoader(loader) return loader From 9f84633a28ceae0e4e6d1c2b7f1eacb0fcfa9083 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 6 Mar 2024 21:53:47 -0600 Subject: [PATCH 097/268] Add `ALCF_utils/data_file_list_polaris.txt` --- ALCF_utils/data_file_list_polaris.txt | 3074 +++++++++++++++++++++++++ 1 file changed, 3074 insertions(+) create mode 100644 ALCF_utils/data_file_list_polaris.txt diff --git a/ALCF_utils/data_file_list_polaris.txt b/ALCF_utils/data_file_list_polaris.txt new file mode 100644 index 0000000000..bac6e0cb33 --- /dev/null +++ b/ALCF_utils/data_file_list_polaris.txt @@ -0,0 +1,3074 @@ +0.00029986601436087147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0553_text_document +0.00025354733193980704 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0299_text_document +0.00022796278454747796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0366_text_document +0.00015898148098181938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0753_text_document +0.00032353442734302674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0429_text_document +0.0003246771202039335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0372_text_document +0.0002262495314665641 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0124_text_document +0.0003123475130228927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0437_text_document +0.0002463525690862687 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0053_text_document +0.0002851511545680644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0615_text_document +0.00017894057315965558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0182_text_document +0.0002929038657836376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0713_text_document +0.00025554560858852976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0688_text_document +0.00026670241496326607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0166_text_document +0.0003428428123152775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0768_text_document +0.0002463170207315496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0692_text_document +0.0002760865339513081 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0041_text_document 
+0.00033407859155259556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0416_text_document +0.00034039084474378353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0630_text_document +0.0003366738865632568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0639_text_document +0.00017446109057505982 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0225_text_document +0.000253028300825639 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0035_text_document +0.00024506403683874226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0365_text_document +0.00024185734672422406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0368_text_document +0.00018780072964724365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0196_text_document +0.00032000780903059043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0328_text_document +0.00035000304711647526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0624_text_document +0.0002569785197146494 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0081_text_document +0.00023270677747769242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0488_text_document +0.00017835163350772668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0189_text_document +0.00022425534843704826 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0118_text_document +0.00025809915378253605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0150_text_document +0.0003372508571793003 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0314_text_document +0.00017405443644933578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0209_text_document +0.00018999333170612105 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0229_text_document +0.0002578638319368945 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0265_text_document +0.0003026297840378106 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0532_text_document +0.00024115277287793853 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0478_text_document +0.00021079670153911382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0140_text_document +0.0002641088110552866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0256_text_document +0.0002734180919243281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0047_text_document +0.00029907507717544046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0607_text_document +0.0002524517419857655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0023_text_document +0.0002411050941613276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0111_text_document +0.00028888961626426636 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0613_text_document +0.0002666480632163931 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0748_text_document +0.0002433081406689229 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0000_text_document +0.00022825376499345922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0127_text_document 
+0.0002446050560023786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0106_text_document +0.0002942835023242189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0563_text_document +0.0002763945689252822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0577_text_document +0.0002491076073033573 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0502_text_document +0.00028576551829606866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0705_text_document +0.00028829382894385313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0538_text_document +0.00025082670805714693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0088_text_document +0.0002630381343492388 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0263_text_document +0.00027810544588847204 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0460_text_document +0.00027747466684446923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0571_text_document +0.0003172910973599917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0653_text_document +0.00037520258694866886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0172_text_document +0.0003211385761767212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0524_text_document +0.0003255969458125186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0652_text_document +0.0003388825652663348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0322_text_document +0.00028869956272163335 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0447_text_document +0.0003205690642373091 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0387_text_document +0.000293306015190877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0612_text_document +0.00026896512005363497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0290_text_document +0.000254210476816404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0339_text_document +0.00022686273928809224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0487_text_document +0.0002611478888145433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0396_text_document +0.00017381296052422173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0178_text_document +0.00023007865273755727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0091_text_document +0.00018780228063321224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0193_text_document +0.0003372571297149312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0408_text_document +0.00021992069576697105 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0496_text_document +0.0001812811364369899 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0755_text_document +0.00033764559935372575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0773_text_document +0.0003018753737677833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0547_text_document +0.0003166294761680392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0384_text_document 
+0.0002728688935238676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0574_text_document +0.0003040110074396542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0533_text_document +0.0002875528493458918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0464_text_document +0.00022373112059479916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0489_text_document +0.00024189392824227604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0050_text_document +0.00026046059377324407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0060_text_document +0.0002480106883840328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0114_text_document +0.0002569141409359981 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0033_text_document +0.00029923086394617387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0561_text_document +0.00017765017246365572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0208_text_document +0.00017213376798867627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0233_text_document +0.0002807023783016899 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0744_text_document +0.0003444627799304048 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0326_text_document +0.00035341598497146246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0313_text_document +0.00023246914923423715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0482_text_document +0.0003151608790524258 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0436_text_document +0.00026075950491544447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0588_text_document +0.0003108468104526031 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0080_text_document +0.0003070762114877851 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0660_text_document +0.0002572020155743189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0038_text_document +0.000253216181543879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0282_text_document +0.00027448432979607844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0745_text_document +0.00033409492045200607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0406_text_document +0.00023278669477113861 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0116_text_document +0.00026115519857486115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0059_text_document +0.0002480831646000483 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0503_text_document +0.00023963680864181876 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0357_text_document +0.0002413187258946405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0171_text_document +0.0003508576397082831 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0770_text_document +0.0002721719890933602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0286_text_document +0.0002876539863114944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0544_text_document 
+0.00024182215509176508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0698_text_document +0.00025302595494243913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0155_text_document +0.0002483407803459808 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0341_text_document +0.0002797174988271654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0463_text_document +0.00026783545872394773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document +0.0002494040268048752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0676_text_document +0.0002964792948022985 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0595_text_document +0.00023982416950674604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0174_text_document +0.00017222619019110588 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0198_text_document +0.00023022730483745666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0480_text_document +0.00024958354880531006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0687_text_document +0.0002600257919823223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0145_text_document +0.00027443988098405596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0004_text_document +0.0002660738793660898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0583_text_document +0.000284693715918069 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0449_text_document +0.0001797115255501787 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0204_text_document +0.0002819480075336546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0715_text_document +0.00024558907785431555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0521_text_document +0.0003452086900121291 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0320_text_document +0.0002849104762477509 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0568_text_document +0.0002520087983082163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0151_text_document +0.0001744206861947346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0197_text_document +0.00028747991690444293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0709_text_document +0.0002459413860995668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0499_text_document +0.000259317580967894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0006_text_document +0.000263466262658637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0269_text_document +0.00030976482490632654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0525_text_document +0.00027110687283220773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0413_text_document +0.0003179317321820123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0656_text_document +0.00033906740854304013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0646_text_document +0.00027369098806344534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0246_text_document 
+0.0002986988698925429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0535_text_document +0.0003395569327922032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0333_text_document +0.000164638204335171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0238_text_document +0.00027443002362662267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0241_text_document +0.00023668635544354816 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0469_text_document +0.00024265468189599862 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0689_text_document +0.00033850556579377534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0403_text_document +0.0003485968296908193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0404_text_document +0.00024590976259223263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0360_text_document +0.0001791680338033577 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0191_text_document +0.00017041559638243079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0236_text_document +0.00025180889395144256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0032_text_document +0.00028885663919789436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0445_text_document +0.00029091485105272474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0614_text_document +0.00023445115384250546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0490_text_document +0.00032117096366987005 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0651_text_document +0.00028813277753017444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0703_text_document +0.00028764255585627627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0702_text_document +0.0003501536879260528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0623_text_document +0.00027353701536416775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0719_text_document +0.00028375403705731966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0728_text_document +0.0002681400162763699 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0031_text_document +0.0002801025363961944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0253_text_document +0.0003054975830412967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0551_text_document +0.0003336419698238177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0327_text_document +0.0002783342023297327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0027_text_document +0.0002269098949925595 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0491_text_document +0.0002614046304668415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0395_text_document +0.0002389768503487008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0473_text_document +0.00030914674481377635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0662_text_document +0.0003367403637081444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0312_text_document 
+0.00029362370355206103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0605_text_document +0.0002934194986726569 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0455_text_document +0.0002720662187997295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0580_text_document +0.00026145129858319734 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0005_text_document +0.00034377973283976473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0311_text_document +0.00024579413020292225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0305_text_document +0.0002647178135703726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0260_text_document +0.00029551321578672775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0566_text_document +0.0003056359366755917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0670_text_document +0.00023036150528601932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0129_text_document +0.0002785463392591407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0742_text_document +0.0003002481568613867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0549_text_document +0.000247964769587491 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0058_text_document +0.00024596805385249104 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0501_text_document +0.00024541585230637415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0071_text_document +0.00029445519245471536 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0450_text_document +0.0003371086021632213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0375_text_document +0.00022005213043345582 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0131_text_document +0.0002415459805374422 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0697_text_document +0.00024822493157675423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0415_text_document +0.00030080408063844975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0560_text_document +0.0003279134620384162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0643_text_document +0.00023726361757455696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0699_text_document +0.00023085162610295972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0515_text_document +0.0002745291533808234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0739_text_document +0.0002506138351201408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0092_text_document +0.00024556923836562873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0046_text_document +0.00023090386526885743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0083_text_document +0.00029727809725102304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0443_text_document +0.00026952806591177387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0746_text_document +0.00031532525062300405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0655_text_document 
+0.00033089961611781194 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0427_text_document +0.0002903020283829348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0603_text_document +0.00022808149386370916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0367_text_document +0.00033365210425584645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0318_text_document +0.00023152652972755692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0520_text_document +0.0002715873565073543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0749_text_document +0.00034308166550822705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0771_text_document +0.0002506620824263125 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0369_text_document +0.0003155532825638363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0434_text_document +0.00029047172619366075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0602_text_document +0.0002491292800421161 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0349_text_document +0.0002559332562543997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0763_text_document +0.00028216899144587154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0731_text_document +0.00021527310967872735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0338_text_document +0.00029197280872618074 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0462_text_document +0.0002497346241797662 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0347_text_document +0.00032116368527223036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0649_text_document +0.00017893692809443551 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0194_text_document +0.0002377747166387212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0134_text_document +0.00027551701163288023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0734_text_document +0.000333287766731292 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0632_text_document +0.00025605546217079896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0280_text_document +0.00017630373365634742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0184_text_document +0.00023722922106495997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0089_text_document +0.0002263994556727904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0095_text_document +0.00030174055734719644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0555_text_document +0.0002847362655958324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0016_text_document +0.00025890418522015135 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0168_text_document +0.0003082976431725841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0665_text_document +0.000346081899625068 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0767_text_document +0.0003209875815780836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0666_text_document 
+0.0002741000975032965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0737_text_document +0.00025522276682037417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0037_text_document +0.0003187901431234778 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0648_text_document +0.0002545353756603635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0064_text_document +0.0002638188827256236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0764_text_document +0.0003507452430899613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0323_text_document +0.0002507240659086237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0009_text_document +0.0003133434303550815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0545_text_document +0.00017501628240587877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0212_text_document +0.0002643006640033749 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0015_text_document +0.00026580807028971245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0267_text_document +0.0002871314019638329 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0727_text_document +0.00030829637332256503 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0661_text_document +0.00017563429413230326 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0211_text_document +0.00017132261428552822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0220_text_document +0.0002591872665582224 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0278_text_document +0.0002831032389225307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0721_text_document +0.0002771475730643005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0718_text_document +0.00016968487666721077 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0207_text_document +0.00023341688807764153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0619_text_document +0.00028239455769356076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0400_text_document +0.00017479168321170443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0754_text_document +0.0002893192781860632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0610_text_document +0.0002451308928177063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0358_text_document +0.00016439582430752915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0758_text_document +0.00025727386724434066 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0298_text_document +0.0001644915605748274 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0756_text_document +0.0002776780120706089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0729_text_document +0.00023809989233164537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0468_text_document +0.00025872266999742285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0397_text_document +0.00026660487071654296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0247_text_document 
+0.00025841743419888175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0149_text_document +0.0002523747545079728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0119_text_document +0.00024845115079991526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0010_text_document +0.00024496676763593767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0093_text_document +0.00029822689182083806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0386_text_document +0.0002800915174915155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0045_text_document +0.0002542209027633981 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0066_text_document +0.00027882609660458894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0393_text_document +0.00029269610879889394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0600_text_document +0.0003170204421857625 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0440_text_document +0.00023451182731251905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0350_text_document +0.0001713437694272821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0214_text_document +0.00028770953412712207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0714_text_document +0.0002521884552358564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0161_text_document +0.0003496380818870961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0775_text_document +0.00017772896273136538 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0203_text_document +0.0002500155180274436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0077_text_document +0.0003263419003478976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0332_text_document +0.0003298955338846564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0700_text_document +0.00022675950192557637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0123_text_document +0.0002502100722272073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0024_text_document +0.0002807683240860951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0013_text_document +0.00026633530353392567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0587_text_document +0.0002526947984544801 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0148_text_document +0.00023452398475010418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0513_text_document +0.0002494133638577342 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0674_text_document +0.00018960923298675975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0188_text_document +0.0002979177307236505 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0599_text_document +0.00024756537851651375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0158_text_document +0.00032700344933800113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0425_text_document +0.00024354258639025316 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0003_text_document 
+0.00029952785677549897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0534_text_document +0.0002633271590276356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0254_text_document +0.00023174553147338835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0121_text_document +0.00023240024227150026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0099_text_document +0.00031612398651064907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0373_text_document +0.00022503402642541325 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0479_text_document +0.0003113968430100502 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0379_text_document +0.000250330440380919 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0344_text_document +0.00024438294664384054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0684_text_document +0.0002809897197744821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0720_text_document +0.0002705284913469089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0391_text_document +0.00027569439555606186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0575_text_document +0.0003410636080348197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0319_text_document +0.0003203709003963575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0336_text_document +0.00029253485792760743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0531_text_document +0.00024432311184342917 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0474_text_document +0.0003191646877331716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0432_text_document +0.000346686134561813 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0766_text_document +0.0002490456434509238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0342_text_document +0.0002308722169421376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0476_text_document +0.00016206769238354457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0237_text_document +0.0002492309682237096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0061_text_document +0.00027812802408354336 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0250_text_document +0.00018887116876408207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0752_text_document +0.00033632001244040556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0329_text_document +0.0003186771831051032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0376_text_document +0.0003352297812173437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0640_text_document +0.00033654919915956005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0634_text_document +0.00024490246128476605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0682_text_document +0.00017628521613838942 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0181_text_document +0.0002723200629015754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0076_text_document 
+0.0002731843102343778 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0244_text_document +0.00024055053612736437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0690_text_document +0.000260910862554004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0303_text_document +0.00017227027731699112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0228_text_document +0.00022621021101649683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0477_text_document +0.00016795045761426564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0224_text_document +0.00016929858707299419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0199_text_document +0.000253504280684824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0343_text_document +0.0002664008409266226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0399_text_document +0.00030071197680104803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0707_text_document +0.0002017638129580961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0760_text_document +0.0003404886940433186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0774_text_document +0.00026063745107418964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0270_text_document +0.0002583694107184419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0144_text_document +0.0002910352929877009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0451_text_document +0.00017621100910867517 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0180_text_document +0.00024697352709939353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0025_text_document +0.00024146432588700466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0363_text_document +0.00023853893101731814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0516_text_document +0.0003278831887056581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0647_text_document +0.0002658199908149806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0581_text_document +0.00024246994827585146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0679_text_document +0.00033843099030900046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0635_text_document +0.00017307547662390532 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0201_text_document +0.0002069914418510578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0133_text_document +0.0002469419507488919 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0351_text_document +0.0003273613879736889 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0325_text_document +0.00017641368782149634 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0183_text_document +0.00025246502306462557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0287_text_document +0.00024313653131424496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0683_text_document +0.000342460619273468 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0316_text_document 
+0.00025368942675803356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0275_text_document +0.00031652716411734427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0424_text_document +0.0002825702051622742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0461_text_document +0.0002837151182300374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0576_text_document +0.0002887102616818689 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0390_text_document +0.00024213267448623674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0052_text_document +0.00023430817410036728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0086_text_document +0.0002241156128142375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0492_text_document +0.00016148562048123923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0216_text_document +0.00034595368155095567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0772_text_document +0.00031905795534990306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0439_text_document +0.00026930584215843127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0249_text_document +0.00022527795969533147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0493_text_document +0.0002608207889423435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0593_text_document +0.00029249050004478847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0442_text_document +0.00017081784506721255 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0218_text_document +0.00023511433888138094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0484_text_document +0.00024698036957836997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0346_text_document +0.0002460253656495418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0157_text_document +0.00024076940542792902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0352_text_document +0.00030382220501968567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0441_text_document +0.00023755063481592102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0486_text_document +0.00028417867237664216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0537_text_document +0.000225984786690567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0485_text_document +0.00024807815243794377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0164_text_document +0.0002503897564561716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0022_text_document +0.00029213581748685935 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0458_text_document +0.00023531327437959358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0497_text_document +0.00025057009253245374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0170_text_document +0.00026922193390778215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0154_text_document +0.0002748674817686949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0751_text_document 
+0.00023953538828395883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0048_text_document +0.00032242897439078245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0428_text_document +0.00022552802583542902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0418_text_document +0.00022344225002222384 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0112_text_document +0.00016663075921299214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0757_text_document +0.00032178194519251073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0421_text_document +0.00023947291743916702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0471_text_document +0.00023146797086919565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0510_text_document +0.00021323599497854087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0466_text_document +0.0003337629126591212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0641_text_document +0.00029173993384632753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0601_text_document +0.00027480517202427865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0740_text_document +0.00027104181603306824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0594_text_document +0.000259327422305542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0276_text_document +0.00031194191484921015 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0383_text_document +0.00016906057458032387 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0232_text_document +0.00027723921638928413 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0717_text_document +0.0003242034882586253 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0644_text_document +0.00025144344474404445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0518_text_document +0.0002742433658271999 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0743_text_document +0.00024688821993792026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0673_text_document +0.00024027301297788078 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0044_text_document +0.0003140105907826361 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0667_text_document +0.00024234707516924134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0308_text_document +0.0002517457940480414 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0675_text_document +0.00027377464202820747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0572_text_document +0.00027127850723190206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0579_text_document +0.0002801181272573297 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0723_text_document +0.0003044955357721288 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0381_text_document +0.0001773914653416496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0759_text_document +0.000244260849707107 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0504_text_document 
+0.00028690435806017796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0708_text_document +0.0002549618760436977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0049_text_document +0.0003338929078012418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0642_text_document +0.00025438827048507865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0074_text_document +0.00024788324580290473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0039_text_document +0.0002657964126243008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0401_text_document +0.00024963314944488873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0409_text_document +0.0002474347556695685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0014_text_document +0.00023102994458882423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0098_text_document +0.0002629133147259061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0146_text_document +0.0002878014380556544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0616_text_document +0.0002484505616779537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0101_text_document +0.0002979288985446429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0446_text_document +0.00029660556996978065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0565_text_document +0.00026095434544066553 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0295_text_document +0.0002831348173037093 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0730_text_document +0.0002365956782695563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0498_text_document +0.00032328415452513646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0638_text_document +0.00024548967283056556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0301_text_document +0.00022916761725282506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0139_text_document +0.00017135825898458961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0192_text_document +0.00025038733972490736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0001_text_document +0.0002612597204533967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0268_text_document +0.00030775483508702073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0527_text_document +0.0002450825999341174 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0359_text_document +0.00035290511311749176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0315_text_document +0.00026505364760999234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0251_text_document +0.0003160354002570778 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0546_text_document +0.00027593535501232487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0262_text_document +0.0003183575069367642 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0659_text_document +0.0002947758106315529 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0567_text_document 
+0.0001741629156660883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0190_text_document +0.00025352755314579954 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0078_text_document +0.00024964796075168576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0175_text_document +0.00025506541936154865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0054_text_document +0.00025555970306634487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0008_text_document +0.00029524207821389497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0452_text_document +0.000199634928805321 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0187_text_document +0.00025285663055065095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0011_text_document +0.00021297141649242722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0138_text_document +0.00022778759217604392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0087_text_document +0.0001953318184154979 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0206_text_document +0.0002870975629878886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0611_text_document +0.00024458163981459747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0509_text_document +0.0001787757081345656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0205_text_document +0.00023554937871277356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0620_text_document +0.0002520185236716444 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0677_text_document +0.00022301112872055435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0132_text_document +0.00025561244200441604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0296_text_document +0.0002253012068856598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0495_text_document +0.00029101930659581156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0444_text_document +0.0003009753416192063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0598_text_document +0.00024953859390345224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0691_text_document +0.00035416563691401055 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0761_text_document +0.00021080283440929313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0417_text_document +0.00033863782739706187 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0317_text_document +0.0002768289718009883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0578_text_document +0.0003211028360358407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0374_text_document +0.00025217196207723947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0055_text_document +0.00022020631342061864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0481_text_document +0.00024394187795805726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0307_text_document +0.0002774350481105133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0736_text_document 
+0.00021927408285748486 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0136_text_document +0.00030251214218079053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0550_text_document +0.00023376098042937013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0084_text_document +0.00024484618716366375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0511_text_document +0.00032440721841087237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0380_text_document +0.00023199362334417763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0356_text_document +0.00035580287272947756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0310_text_document +0.0002526262802535738 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0110_text_document +0.0003063445757116061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0668_text_document +0.00024646752300382937 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0306_text_document +0.00022331705652869183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0115_text_document +0.0003421795194248901 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0324_text_document +0.00017598828219869743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0202_text_document +0.00025019952804968546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0294_text_document +0.0002964977996849329 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0704_text_document +0.00034193340092088606 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0629_text_document +0.0003000100913454835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0608_text_document +0.00033314792957463103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0627_text_document +0.00027662517486203096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0725_text_document +0.0002308977883551111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0472_text_document +0.00016969055286069522 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0230_text_document +0.00034668143602649003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0407_text_document +0.0003072554613775016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0556_text_document +0.00024691426562678927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0505_text_document +0.00025896543472554137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0040_text_document +0.00029337350699737376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0606_text_document +0.00024039263045799383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0096_text_document +0.00025688438655402954 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0281_text_document +0.00017826393466000563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0179_text_document +0.0002987145843993446 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0557_text_document +0.0002545665977705435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0288_text_document 
+0.0003407221032168543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0769_text_document +0.00033154596910064173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0370_text_document +0.00026418756319139476 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0017_text_document +0.00024071241177027537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0694_text_document +0.0003034159035548678 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0385_text_document +0.00021807629186309122 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0130_text_document +0.00029528370867186834 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0562_text_document +0.00024176561585449852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0506_text_document +0.0002435688410132227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0036_text_document +0.00017344067673492444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0217_text_document +0.00025145685285303255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0289_text_document +0.00027990862231094815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0712_text_document +0.0002803072024649089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0724_text_document +0.0002913097590657721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0564_text_document +0.00022731318225581286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0105_text_document +0.00022996707132323673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0120_text_document +0.00021632449585577137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0141_text_document +0.00031827065104563006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0431_text_document +0.0002819283047772193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0142_text_document +0.0002815277519333656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0570_text_document +0.00023765405087801319 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0512_text_document +0.00018998873335086353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0227_text_document +0.00024146656166238478 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0411_text_document +0.0003004350285587871 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0389_text_document +0.000278065966594337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0735_text_document +0.000281790862465637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0585_text_document +0.0002439702643406182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0122_text_document +0.0002537403759927729 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0042_text_document +0.0002423193871139386 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0309_text_document +0.0003495948469686597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0765_text_document +0.00033236219927066605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0636_text_document 
+0.0002858695230288609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0539_text_document +0.0002348134342912561 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0467_text_document +0.0002691883546328634 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0586_text_document +0.00026279179878842626 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0750_text_document +0.00018511392917200416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0200_text_document +0.00030242676078679127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0671_text_document +0.00030050997791376127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0530_text_document +0.0002635999306185633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0012_text_document +0.00024458976049193177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0082_text_document +0.0002518674158063562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0160_text_document +0.0003223699699116345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0438_text_document +0.00018505528728151655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0195_text_document +0.000178243040573062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0185_text_document +0.00016914020058443556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0215_text_document +0.00025069619779996 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0173_text_document +0.0002815996313559731 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0710_text_document +0.0002412312344016045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0348_text_document +0.00026767326358785484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0590_text_document +0.0002457918957813268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0073_text_document +0.00024745396271518434 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0030_text_document +0.000261259507683573 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0079_text_document +0.0002548186120067791 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0072_text_document +0.00027399569205024244 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0019_text_document +0.00021413864579792835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0239_text_document +0.0002472353073125973 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0410_text_document +0.0002940208200391643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0453_text_document +0.00028793368048032474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0543_text_document +0.00024228067857454152 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0007_text_document +0.0002757861162724024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0733_text_document +0.00022515235419999868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0125_text_document +0.0002789873312304594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0569_text_document 
+0.00032181605988926585 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0331_text_document +0.00025600745719208296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0062_text_document +0.0002429596099341584 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0043_text_document +0.00021272272444376823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0433_text_document +0.00016894488939341143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0235_text_document +0.00029657717456198863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0448_text_document +0.0002464749943157843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0696_text_document +0.0002990022489754111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0559_text_document +0.0002686072777622895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0392_text_document +0.000310214707858456 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0664_text_document +0.00023147049770827952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0483_text_document +0.00027094316845184026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0591_text_document +0.0002630382106124418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0272_text_document +0.00028196904286546527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0271_text_document +0.00025700449997059283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0340_text_document +0.00025554555240646604 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0159_text_document +0.00025089565112080837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0153_text_document +0.00028712055905116097 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0541_text_document +0.00024771562469786036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0028_text_document +0.00025202328267601593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0067_text_document +0.0001684887169833427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0222_text_document +0.0002607742259699303 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0165_text_document +0.00022397819464514847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0117_text_document +0.00030314913238885265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0669_text_document +0.00023619497538123523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0103_text_document +0.0002734489760396025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0258_text_document +0.00023077416637994847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0097_text_document +0.00022084705451473572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0419_text_document +0.00029499302425346795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0459_text_document +0.00029225502792904867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0609_text_document +0.0002294695004036345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0104_text_document 
+0.00021262811723288358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0430_text_document +0.00027993786496231197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0582_text_document +0.00029708818969150343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0457_text_document +0.00029628163028225886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0529_text_document +0.000262417089919526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0029_text_document +0.0003298930375057276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0426_text_document +0.0002575102330835925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0279_text_document +0.00028844680503198393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0596_text_document +0.00033707683763665075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0631_text_document +0.0002473741577858656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0517_text_document +0.00023231405560125397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0507_text_document +0.0002670076117276825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0252_text_document +0.0003441336056088313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0626_text_document +0.00028441868553742185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0056_text_document +0.00031718446831561955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0335_text_document +0.0002803626135482851 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0542_text_document +0.0002173887650989829 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0672_text_document +0.0002539392289101208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0284_text_document +0.00026280233213567066 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0257_text_document +0.00031105597221457113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0654_text_document +0.0002443105227741655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0678_text_document +0.00024790807827507997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0018_text_document +0.0003036707221560443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0528_text_document +0.00022103058913292817 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0422_text_document +0.000244961408904958 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0068_text_document +0.00027503974364758305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0245_text_document +0.0002854117937664233 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0617_text_document +0.0002620533561829337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0255_text_document +0.0002596756834115267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0176_text_document +0.0002881031625880268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0732_text_document +0.0001650387853828719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0221_text_document 
+0.0003234602042935272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0371_text_document +0.00021389341379725142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0137_text_document +0.00036936242757777487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0177_text_document +0.00031381814871258624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0382_text_document +0.0002539766707583296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0291_text_document +0.00029021877762037306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0597_text_document +0.0003421392407209083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0321_text_document +0.00028701185284984564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0292_text_document +0.0002666537457275393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0273_text_document +0.0002818729480708607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0716_text_document +0.00022785209460036168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0094_text_document +0.0002464632290244915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0026_text_document +0.0002557240597452918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0021_text_document +0.0002650994330587443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0242_text_document +0.000165454036266872 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0226_text_document +0.0002302794555560894 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0100_text_document +0.00024098227294979652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0519_text_document +0.00030313866988967254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0552_text_document +0.00024472340746819576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0057_text_document +3.716455926414704e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0776_text_document +0.0002289064733184156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0109_text_document +0.00017047069361339454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0219_text_document +0.00028965044054042247 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0706_text_document +0.00024215674906462835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0508_text_document +0.00027868463874204896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0722_text_document +0.0002863276814410383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0456_text_document +0.00033593485551269495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0633_text_document +0.00022535057625369098 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0618_text_document +0.00025951432987823613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0354_text_document +0.00023660867076665115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0514_text_document +0.00023164910184270907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0475_text_document 
+0.0002717934880479726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0034_text_document +0.00025551605524358457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0300_text_document +0.00028788385285618325 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0454_text_document +0.00023790235664678007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0693_text_document +0.0003083326960421146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0658_text_document +0.00026621224495881597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0304_text_document +0.00023989172020287585 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0090_text_document +0.0003075436157979873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0548_text_document +0.00025027414775809285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0063_text_document +0.00028423970956347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0465_text_document +0.0003385868271697667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0414_text_document +0.0002461873511746418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0169_text_document +0.0003497705905560846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0762_text_document +0.0002837602935731654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0701_text_document +0.00021285518550466257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0135_text_document +0.00025887155855837775 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0143_text_document +0.00024612669665761364 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0167_text_document +0.00024926275847218885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0523_text_document +0.0002831882638199006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0540_text_document +0.0003218801435428595 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0334_text_document +0.0002504527051869514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0297_text_document +0.00027958478618795354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0738_text_document +0.00024632955113681093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0500_text_document +0.0002959791358976716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0554_text_document +0.00023808022251730368 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0695_text_document +0.0002693302215640041 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0747_text_document +0.00024118244197147295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0108_text_document +0.00024254119141966469 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0113_text_document +0.00024082979776199307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0686_text_document +0.0002440387750957088 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0302_text_document +0.00024571556691977537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0362_text_document 
+0.00024313649385953578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0355_text_document +0.00033420558196964426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0645_text_document +0.00022171807844232732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0102_text_document +0.00025367115675703917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0681_text_document +0.00029828158945969385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0536_text_document +0.00026580908134122904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0156_text_document +0.00026144782973561495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0163_text_document +0.00026560354050198784 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0277_text_document +0.00027259447866398304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0573_text_document +0.0002454669644465353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0293_text_document +0.0002749898352476231 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0741_text_document +0.0003182032271539976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0377_text_document +0.00032183101559976087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0423_text_document +0.00026140406796931173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0592_text_document +0.00030798312041121676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0657_text_document +0.00027046310241926396 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0584_text_document +0.00022915168410979936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0628_text_document +0.0003144797967546013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0240_text_document +0.00024793054681359315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0361_text_document +0.00025289392596448485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0680_text_document +0.0003249156807537638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0388_text_document +0.0003200320573446812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0435_text_document +0.00024932773314172515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0345_text_document +0.000352339377109151 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0637_text_document +0.0002181818784932686 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0420_text_document +0.0003065464559171036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0378_text_document +0.00025564864920100716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0398_text_document +0.00017648320872558042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0234_text_document +0.0002556592118802528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0266_text_document +0.000180741470711273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0210_text_document +0.000283307650028466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0711_text_document 
+0.00024948381052490083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0162_text_document +0.0002657201622822601 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0264_text_document +0.0002773843344343063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0070_text_document +0.00017077838788489376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0223_text_document +0.0002368054569060717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0470_text_document +0.00025662519161828116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0002_text_document +0.0002648571396733212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0069_text_document +0.0002376373641281886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0625_text_document +0.00025775792581353527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0285_text_document +0.0002449626349826061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0065_text_document +0.0002500622157014314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0147_text_document +0.00023556842149918296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0085_text_document +0.0002680577361774155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0394_text_document +0.00033360487612389944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0330_text_document +0.0003496319072752229 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0621_text_document +0.00026102956713198886 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0248_text_document +0.0002591069584722685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0274_text_document +0.00031214502293740203 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0526_text_document +0.00023818435518274705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0685_text_document +0.00027077616083688154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0589_text_document +0.00021454413499194508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0128_text_document +0.00034352014327578543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0405_text_document +0.00023102558540219145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0126_text_document +0.0002447224377071115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0075_text_document +0.00024328280481448173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0494_text_document +0.0002720049988519714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0283_text_document +0.00030256682583053806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0558_text_document +0.0002627148374838038 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0402_text_document +0.00026820426193286214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0261_text_document +0.00022606610063852957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0107_text_document +0.00023977894226853914 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0522_text_document 
+0.0002971202576914705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0604_text_document +0.0001695977590443741 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0231_text_document +0.0003090810731707665 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0663_text_document +0.0002808357894937908 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0726_text_document +0.0002470374776308281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0152_text_document +0.00024230275654617777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0353_text_document +0.0002875365676033139 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0243_text_document +0.00023312322239977016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0364_text_document +0.00017547132733894936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0213_text_document +0.00026021642426790116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0412_text_document +0.00034525313882415157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0622_text_document +0.0003216956035139199 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0650_text_document +0.00032148593882094746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0354_text_document +0.00031099927571265226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0200_text_document +0.00026288736854021024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0391_text_document +0.00030120157866719887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0559_text_document 
+0.00033262173175005845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0473_text_document +0.0003282483062358124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0528_text_document +0.000365895582017286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0073_text_document +0.00019615092289600435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0170_text_document +0.00019615732562059426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0180_text_document +0.00029826391241638425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0582_text_document +0.00022144916152761252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0067_text_document +0.00025039196312518126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0036_text_document +0.00023453794655702172 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0023_text_document +0.00024046718292842934 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0099_text_document +0.0003548582198382252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0342_text_document +0.00032576405247938887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0353_text_document +0.00025884065661827183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0044_text_document +0.00028762290770248925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0575_text_document +0.00022250703391263435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0107_text_document +0.00028658440278656646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0566_text_document +0.0002242413543649295 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0101_text_document +0.00025523457182268395 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0542_text_document +0.00032490234484662474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0217_text_document +0.000336341248357978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0437_text_document +0.0002806238955687308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0583_text_document +0.0001944036099466144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0172_text_document +0.00040165432728360835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0060_text_document +0.00023820268565279 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0072_text_document +0.00027589339829224633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0393_text_document +0.0003664961403005019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0414_text_document +0.00032640050094795284 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0355_text_document +0.00026267086768592317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0082_text_document +0.00027675886876949677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0281_text_document +0.0002507184125038478 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0051_text_document +0.0002955728326278075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0133_text_document +0.00033895534360196215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0471_text_document +0.0003249583290321772 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0346_text_document +0.00025269892975402695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0034_text_document +0.0003597803812098366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0300_text_document +0.00029054067647750703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0282_text_document +0.00031819275856278644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0309_text_document +0.00022047319830454594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0125_text_document +0.0003339271619378178 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0434_text_document +0.00032250687905768815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0460_text_document +0.00032737840955958814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0186_text_document +0.00036817332427086083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0504_text_document +0.00037520434935067656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0487_text_document +0.00023497869981097718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0132_text_document +0.00031875608212684787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0225_text_document +0.00027112709318564797 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0401_text_document +0.00033378436021836355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0477_text_document +0.0003539127090987545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0607_text_document +0.00029769152503833615 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0562_text_document +0.0002635535461200955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0497_text_document +0.00019676121443972563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0174_text_document +0.000280048968941906 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0569_text_document +0.0002846942777387065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0591_text_document +0.00024086454535308076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0110_text_document +0.00033123896164923103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0455_text_document +0.00019918331774706222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0166_text_document +0.00030299320911149845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0453_text_document +0.00027898564892737796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0581_text_document +0.00034741102397772714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0341_text_document +0.00027420078298171223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0544_text_document +0.000359960106052341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0606_text_document +0.0003529909755563099 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0291_text_document +0.00026046023186147605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0086_text_document +0.0002867716530924125 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0578_text_document +0.0001108817386927782 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0083_text_document +0.0003214305663204344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0357_text_document +0.0003059007996821165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0188_text_document +0.0003700321466583531 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0411_text_document +0.00031683945155459305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0218_text_document +0.0002275568413843357 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0003_text_document +0.0002232470277297263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0001_text_document +0.0002588269802342886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0543_text_document +0.00030658288378059576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0307_text_document +0.0001937432554458501 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0169_text_document +0.00021412867224064434 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0149_text_document +0.00029445648351402766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0561_text_document +0.0003245137199738549 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0310_text_document +0.00028866486070796286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0222_text_document +0.0002520967807738254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0547_text_document +0.0003017454528581887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0183_text_document +0.0003205557761638949 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0214_text_document +0.00022447860828692386 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0111_text_document +0.0002281520513155878 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0127_text_document +0.0002404263491727334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0054_text_document +0.0002344960672283001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0007_text_document +0.00029308796650945226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0572_text_document +0.00024115117947338366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0103_text_document +0.0003554380548369965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0334_text_document +0.00022154415907974328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0114_text_document +0.0003358495217267899 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0513_text_document +0.0002547322407705563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0237_text_document +0.00025785286289661753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0397_text_document +0.00033317531686410663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0306_text_document +0.0002403263232130821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0120_text_document +0.00023805262922692774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0129_text_document +0.00025828449660578406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0262_text_document +0.0003388433440938099 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0459_text_document +0.0001148703143628027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0085_text_document +0.0003139139975696427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0206_text_document +0.0002822891138876659 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0271_text_document +0.00035623232436338923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0610_text_document +0.00023526449481041633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0031_text_document +0.0002605136896592062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0390_text_document +0.00023442874329110232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0043_text_document +0.00024393396170220794 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0012_text_document +0.00036856641866371385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0409_text_document +0.0002976197958858269 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0574_text_document +0.00020244627431123625 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0596_text_document +0.0002892290973997832 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0588_text_document +0.00027153487772157026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0532_text_document +0.00025976541496692754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0236_text_document +0.00037566485973808487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0501_text_document +0.0002799158826581256 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0269_text_document +0.0002772943564900957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0540_text_document +0.00019716161047018854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0175_text_document +0.0002423031004074836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0290_text_document +0.00028139406975870025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0233_text_document +0.00034787707721083003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0440_text_document +0.0002619461653179241 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0071_text_document +0.00024498614841725265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0037_text_document +0.0003610826239115939 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0420_text_document +0.0002600795555097568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0404_text_document +0.0002040950865650348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0141_text_document +0.00029488273028836727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0592_text_document +0.00019996210584324688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0164_text_document +0.00019384504696633997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0162_text_document +0.00028397293275668543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0388_text_document +0.00019716886585348885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0159_text_document +0.00027827065793369874 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0372_text_document +0.0003200957118485349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0476_text_document +0.0002983546202761619 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0323_text_document +0.0002416269472396373 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0008_text_document +0.000196846988205648 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0151_text_document +0.00027146549828798404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0533_text_document +0.00033082151018825224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0344_text_document +0.00031507825757410413 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0481_text_document +0.00031601467832641786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0204_text_document +0.00020437098539854683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0179_text_document +0.0003706225395981237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0496_text_document +0.0003443040119002004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0469_text_document +0.00022162180585066958 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0055_text_document +0.0002886852383260554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0367_text_document +0.00028528967629728904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0277_text_document +0.0002760425208323181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0603_text_document +0.00035602168449419384 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0512_text_document +0.00033735094405926163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0340_text_document +0.00020409686476119691 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0143_text_document +0.00019850542947853238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0140_text_document +0.00026020939632042517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0285_text_document +0.0002264343023157897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0124_text_document +0.00027023502738295313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0531_text_document +0.0002705793532625371 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0375_text_document +0.00022691069665871407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0013_text_document +0.0003333799802252519 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0522_text_document +0.000243481939995933 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0066_text_document +0.00035504218541120377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0332_text_document +0.00027735232825717244 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0228_text_document +0.0003184950331829959 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0445_text_document +0.0003508179986368801 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0430_text_document +0.00025129465094801635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0018_text_document +0.00027981836842029604 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0392_text_document +0.0003739220447382562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0505_text_document +0.000398980676570116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0485_text_document +0.00029607647166155567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0130_text_document +0.0002406963619761374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0026_text_document +0.0002572220385008283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0489_text_document +0.00023570904655852102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0006_text_document +0.00020195852509940438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0157_text_document +0.00032680028348833006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0467_text_document +0.0003340943214022902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0454_text_document +0.0003653992024281921 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0600_text_document +0.00019009456253412738 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0163_text_document +0.0002493407030757422 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0248_text_document +0.0003440124023956098 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0339_text_document +0.00025712407819095047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0534_text_document +0.00024028802893179358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0038_text_document +0.00020495827636638026 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0597_text_document +0.000351176486232251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0303_text_document +0.0003456182915527073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0425_text_document +0.00022200464825785848 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0000_text_document +0.0003315535250241385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0352_text_document +0.00028612336058741426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0226_text_document +0.00030129740450991597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0461_text_document +0.00025487427602225745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0545_text_document +0.00022872773425678075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0102_text_document +0.0002835592166132379 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0587_text_document +0.00033564827847697897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0048_text_document +0.00033858047476084023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0336_text_document +0.00037381384462677615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0502_text_document +0.0003544715223035618 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0427_text_document +0.00022837425047517556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0090_text_document +0.00026727566447651724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0369_text_document +0.00033598278535636194 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0216_text_document +0.00036767226126370613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0292_text_document +0.0002476495487244962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0243_text_document +0.00033806147568624263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0326_text_document +0.0002725192743462296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0602_text_document +0.0003386020917687969 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0611_text_document +0.0002475431480184715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0499_text_document +0.0002473766564211022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0032_text_document +0.0002851408177452277 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0599_text_document +0.0002170924906780552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0097_text_document +0.00032913434544042444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0182_text_document +0.0002718817554852544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0378_text_document +0.0003605780424180856 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0509_text_document +0.00020037461147756993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0139_text_document +0.00028996408514680153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0456_text_document +0.00029824911284238767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0322_text_document +0.0002866873513416864 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0221_text_document +0.00020169453438146766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0153_text_document +0.0002568539383574184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0076_text_document +0.0002563470964947752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0057_text_document +0.00028934147258318096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0284_text_document +0.0002633358277621299 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0075_text_document +0.0003542020133336853 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0422_text_document +0.0002670040366080037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0288_text_document +0.0002582687175685798 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0077_text_document +0.0003391102438693685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0305_text_document +0.0003008490817946171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0273_text_document +0.0002653817061873682 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0242_text_document +0.0002448781942433207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0050_text_document +0.0003398433124009884 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0525_text_document +0.0002860462478845397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0232_text_document +0.00019774620740653148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0173_text_document +0.0003435288005031101 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0294_text_document +0.00024599133263112835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0016_text_document +0.0002717967551816393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0365_text_document +0.00037903039437164123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0604_text_document +0.0002088127217590141 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0138_text_document +0.00019764275005400696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0178_text_document +0.0002824696088080159 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0377_text_document +0.00025439496382211074 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0042_text_document +0.0003406741244113085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0065_text_document +0.00023424846114057608 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0112_text_document +0.00019313677074461676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0142_text_document +0.0003095287449412706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0184_text_document +0.00025333741815045563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0495_text_document +0.0002817316346120187 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0276_text_document +0.00034078147118718575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0301_text_document +0.0003040077750531272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0210_text_document +0.00025716096160298353 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0494_text_document +0.00032046393022247256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0447_text_document +0.00034897289640574213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0333_text_document +0.0003296405548599967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0424_text_document +0.0002740502161790299 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0224_text_document +0.00022714458123629922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0105_text_document +0.00034363243971404173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0081_text_document +0.0002872372090352805 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0579_text_document +0.00025581154131057356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0536_text_document +0.00019475256915422646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0168_text_document +0.0003692302803692929 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0293_text_document +0.00023876622604802913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0021_text_document +0.0002882922005665277 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0230_text_document +0.00024564448893769165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0260_text_document +0.0003225686520264833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0450_text_document +0.0003634459652928689 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0465_text_document +0.00027645525638440404 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0394_text_document +0.00030963921241643964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0319_text_document +0.00023334329927625922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0028_text_document +0.00036313463407885765 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0608_text_document +0.00025564669907419214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0538_text_document +0.00019278288308391885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0155_text_document +0.00034000302243969134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0517_text_document +0.0002509806072007909 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0240_text_document +0.00034220184694416697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0515_text_document +0.00019740285264181645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0158_text_document +0.00031502267568822904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0209_text_document +0.00030626516902061067 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0321_text_document +0.000348986228477158 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0296_text_document +0.0002927218722236596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0576_text_document +0.00031884703885737373 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0080_text_document +0.0002242778767677176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0091_text_document +0.00025642014581296705 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0259_text_document +0.00024397330133089134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0121_text_document +0.00028896335037186975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0580_text_document +0.00021150345520434648 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0595_text_document +0.00024611075336120286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0062_text_document +0.0003398153377945975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0436_text_document +0.0003384490583704485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0337_text_document +0.00032180256807125913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0059_text_document +0.00022578733674539733 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0115_text_document +0.0003765131080436743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0412_text_document +0.0003215742730449396 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0462_text_document +0.0002245422695597009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0106_text_document +0.00036405219300064597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0423_text_document +0.00028029912172422236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0366_text_document +0.00032521630849492397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0215_text_document +0.0002571120134876151 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0263_text_document +0.000353082459811595 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0295_text_document +0.000326822292642579 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0443_text_document +0.00030304625919441264 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0557_text_document +0.0002265130266993121 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0010_text_document +0.00019743743439428228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0136_text_document +0.00025992560920928023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0376_text_document +0.00025253051916689003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0235_text_document +0.00019383475662344355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0135_text_document +0.00019960789056665916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0144_text_document +0.0002678902628605307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0548_text_document +0.00025659636029182583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0537_text_document +0.000237672163839986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0128_text_document +0.0002508337955711484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0287_text_document +0.00029901333173360016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0194_text_document +0.0002514033871582251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0069_text_document +0.00031036518034747307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0324_text_document +0.00027152269930032273 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0364_text_document +0.0003245993299318324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0187_text_document +0.0003302486844096023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0096_text_document +0.0003062956939316864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0558_text_document +0.00023694500660912133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0063_text_document +0.0003498659421229062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0551_text_document +0.00026632904273109524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0286_text_document +0.00031830571948368423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0449_text_document +0.0002620222328990345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0255_text_document +0.00032743994990305114 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0358_text_document +0.0003938666304364765 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0383_text_document +0.00028452006167164925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0283_text_document +0.0003322166464351345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0470_text_document +0.000258744238720393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0403_text_document +0.00032411432235958506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0349_text_document +0.0003296906897932579 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0524_text_document +0.0003443435331616602 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0426_text_document +0.00025288497022764084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0486_text_document +0.00035861042515638225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0519_text_document +0.000248397389077527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0380_text_document +0.00026829844879360724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0387_text_document +0.00022119227073337001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0122_text_document +0.00036944219572927674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0518_text_document +0.0003011917754611523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0554_text_document +0.000321480091824964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0027_text_document +0.0003533262482265537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0418_text_document +0.00023730278437006437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0104_text_document +0.00023868209081281727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0039_text_document +0.00029588940231159373 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0268_text_document +0.00032104344076418505 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0201_text_document +0.000292230059627145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0094_text_document +0.000323725733362109 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0347_text_document +0.00036745776649266704 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0416_text_document +0.0003409543488124674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0514_text_document +0.0002770342357912649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0231_text_document +0.0002337063719146717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0330_text_document +0.00032493466645558633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0523_text_document +0.00028683852667057725 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0570_text_document +0.0003469424073533543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0421_text_document +0.0002355667414524093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0009_text_document +0.00033958966888490423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0478_text_document +0.00037452714181020425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0417_text_document +0.00037157900358371685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0482_text_document +0.00032107564301446506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0213_text_document +0.0002779718104193777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0568_text_document +0.0003302748963883548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0312_text_document +0.00027108401282429457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0550_text_document +0.00034186039289312654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0219_text_document +0.00027051936616135047 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0093_text_document +0.00025501046585739055 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0530_text_document +0.0002946513779650046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0035_text_document +0.0003252745507933036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0356_text_document +0.0002657380205070845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0406_text_document +0.0002620270293581152 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0498_text_document +0.0003653407408166962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0297_text_document +0.00028067817330870856 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0266_text_document +0.00035628032009124333 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0428_text_document +0.0003460280889408009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0074_text_document +0.00025565166227441784 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0244_text_document +0.00033985184011624644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0431_text_document +0.00037036602086903427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0410_text_document +0.00029949526237892663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0134_text_document +0.00027881676000299146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0246_text_document +0.00031647397707654777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0408_text_document +0.0002967146805014503 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0563_text_document +0.00033325946969851915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0360_text_document +0.00033990478189638713 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0468_text_document +0.0002584821322450841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0402_text_document +0.00027278965131770946 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0370_text_document +0.00020295833140368427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0148_text_document +0.00024279037083080438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0092_text_document +0.00022359240324368993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0089_text_document +0.00025058114122039534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0251_text_document +0.00019085874667820006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0061_text_document +0.0003549309183005571 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0609_text_document +0.0002647672434074523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0539_text_document +0.00031752257400591305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0475_text_document +0.0002522964974057918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0261_text_document +0.00036196222293690685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0304_text_document +0.00031879924506906604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0203_text_document +0.00026311419200259503 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0084_text_document +0.0002685903870422415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0395_text_document +0.00021930149045373045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0131_text_document +0.00031525572610882754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0197_text_document +0.0003250055544565549 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0087_text_document +0.0002494706424870606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0327_text_document +0.00027762277923554745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0279_text_document +0.0003083430926571075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0196_text_document +0.00031048286507434094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0555_text_document +0.00024138490994514737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0041_text_document +0.00031653009280484387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0317_text_document +0.00030726372172028754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0193_text_document +0.00034407639895572313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0441_text_document +0.0003077437581952319 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0202_text_document +0.00035799065644308883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0511_text_document +0.00030396208504619444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0220_text_document +0.00031563469263051037 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0474_text_document +0.0002904537122835995 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0577_text_document +0.00032010237765861207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0014_text_document +0.00019481860586783526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0171_text_document +0.0003394041823348506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0361_text_document +0.0002558062319927343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0264_text_document +0.0002513457601549774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0024_text_document +0.00034069289937398433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0432_text_document +0.00032158521162506154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0451_text_document +0.0003039551974410624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0552_text_document +0.0002904199116648874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0584_text_document +0.0002538664480925548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0239_text_document +0.00029585659356578213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0458_text_document +0.00027200428224862015 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0407_text_document +0.0003061108912685211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0045_text_document +0.0002685534203724513 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0541_text_document +0.0002687083874265679 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0371_text_document +0.0003239815149554464 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0480_text_document +0.00028480018183138863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0590_text_document +0.00023635788418747915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0015_text_document +0.00023164951700334075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0108_text_document +0.00031251045815569193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0320_text_document +0.00019130373682690652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0145_text_document +0.0004018105513267898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0483_text_document +0.00033404230628775514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0521_text_document +0.0003629045692047148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0419_text_document +0.00019355538307594888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0150_text_document +0.00031931294475357857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0526_text_document +0.0002819143043874387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0589_text_document +0.00022469124701918232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0119_text_document +0.00031109478995926487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0315_text_document +0.00026688680630152287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0546_text_document +0.00035710114951904826 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0510_text_document +0.0002892627585786743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0373_text_document +0.00036982120060819184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0413_text_document +0.00025789399110047885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0249_text_document +0.00025787093140932716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0484_text_document +0.00039747640249024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0493_text_document +0.0002517985792404221 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0491_text_document +0.0002652211441668472 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0385_text_document +0.0002316547001935751 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0005_text_document +0.000290714543042489 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0374_text_document +0.0003159889683761466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0185_text_document +0.0003369343053203455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0345_text_document +0.000288343878814311 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0571_text_document +0.00018838166008344263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0167_text_document +0.0002714687254850031 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0234_text_document +0.00031494166859812457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0318_text_document +0.0003268206155290055 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0520_text_document +0.00025580125931601587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0256_text_document +0.00023823591542563343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0116_text_document +0.00030252744513705306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0088_text_document +0.0003043223524171429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0556_text_document +0.00036254935821950336 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0302_text_document +0.0002464195089998822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0238_text_document +0.00030693311532016543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0205_text_document +0.00023952239736627944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0019_text_document +0.0003115236813519545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0191_text_document +0.00031249741556856433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0199_text_document +0.00026528939183645053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0078_text_document +0.0002439509776473377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0594_text_document +0.00032623079253258036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0195_text_document +0.00024309468267130917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0030_text_document +0.00034505079788060524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0439_text_document +0.000313161148438969 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0448_text_document +0.00033674896385088213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0350_text_document +0.00027937103577285136 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0267_text_document +0.00027990316836378013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0275_text_document +0.0003240373233746227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0348_text_document +0.0002977493763185773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0560_text_document +0.00019744731070470085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0181_text_document +0.00024582577153508796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0329_text_document +0.00034198380503166554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0516_text_document +0.0003267589630470458 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0363_text_document +0.0002489019202553718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0258_text_document +0.00032334233658009833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0359_text_document +0.00035987415471499623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0299_text_document +0.00032683611343026025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0457_text_document +0.00025921593001537887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0379_text_document +0.00022476727556616815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0049_text_document +0.00027380222381141376 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0368_text_document +0.00025439005748084117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0265_text_document +0.00023975192095218565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0046_text_document +0.0003189906411148287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0311_text_document +0.00020110221297543672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0177_text_document +0.00024285538318068062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0058_text_document +0.00023779525016768145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0040_text_document +0.0002550856807797905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0549_text_document +0.0003769436968567075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0605_text_document +0.00021456324230875702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0160_text_document +0.00033854449229250405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0472_text_document +0.0002487883467469115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0020_text_document +0.00034510827416249317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0553_text_document +0.0003135334924716841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0211_text_document +0.00024364851124096691 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0052_text_document +0.0003541654171496327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0466_text_document +0.00025475723682504567 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0382_text_document +0.00032694655879361305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0351_text_document +0.0003713644632352606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0433_text_document +0.0002797686093879111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0270_text_document +0.0002759556053746586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0593_text_document +0.0002669498434795677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0529_text_document +0.0002821471576067372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0095_text_document +0.0003133308099231397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0308_text_document +0.00020949279074901415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0152_text_document +0.00036351719828329717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0064_text_document +0.00030678901532121404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0189_text_document +0.00031709424150066075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0527_text_document +0.00024417776749073654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0070_text_document +0.0002680981881658349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0400_text_document +0.0002593798629597245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0257_text_document +0.00027539122337661716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0229_text_document +0.00019362359938287403 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0154_text_document +0.0003372318177773526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0362_text_document +0.0002841575840215986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0573_text_document +0.00019780262502880867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0161_text_document +0.00025623938780712984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0252_text_document +0.00036574619481231154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0386_text_document +0.00028409153738023557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0280_text_document +0.00028235078893270346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0585_text_document +0.0003591167512270668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0464_text_document +0.0003469074251062574 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0338_text_document +0.0002969586965778641 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0278_text_document +0.00037523287720373535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0506_text_document +0.0002449994567974368 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0033_text_document +0.0001996403556767957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0137_text_document +0.00033148641160045666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0444_text_document +0.0002586467098165628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0247_text_document +0.00023073263565077901 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0109_text_document +0.0002441039887082391 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0004_text_document +0.00023271649855413572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0011_text_document +0.0003741176027693515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0508_text_document +0.0002198975659046473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0126_text_document +0.00023134591341784835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0017_text_document +0.0002488855690185143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0254_text_document +0.0002872080458059506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0567_text_document +0.00023614036077450578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0100_text_document +0.0002675740169569111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0398_text_document +0.00022979888140300082 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0117_text_document +0.00020928513439559754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0147_text_document +0.00021373565320847123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0176_text_document +0.00019546090607062898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0156_text_document +0.0002686258383757261 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0490_text_document +0.00025788665245942143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0022_text_document +0.0003171090744680125 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0190_text_document +0.00031782937252322326 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0047_text_document +0.00031802026676613546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0207_text_document +0.00031662885342709164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0446_text_document +0.00027908551130471514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0227_text_document +0.00033733035254319147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0435_text_document +0.00027399448876130287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0289_text_document +0.00019694719038293703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0146_text_document +0.0002699463758978633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0598_text_document +0.00036803059610704023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0503_text_document +0.00019238917001147227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0165_text_document +0.0003032499694911512 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0208_text_document +0.0002324778915243651 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0025_text_document +0.00034461966416924293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0442_text_document +0.00029845740389518305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0325_text_document +0.0003573488582687763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0429_text_document +0.00024133252840622868 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0056_text_document +0.00031262504271566886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0212_text_document +0.00023948892639413417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0002_text_document +0.0002463627682743308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0245_text_document +0.0003131758857388708 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0068_text_document +0.0002694331967435558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0381_text_document +0.0003258932251973233 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0452_text_document +0.0002642164231729589 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0500_text_document +0.0002714159542817306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0396_text_document +0.00024335790850090816 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0328_text_document +0.0002516843829910143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0384_text_document +0.00026719508044412176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0389_text_document +0.00028487275709931573 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0586_text_document +0.00025749821847028987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0488_text_document +0.0003655946095122969 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0298_text_document +0.00031665029873002615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0463_text_document +0.00024262656469095593 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0118_text_document +0.00034363087609274955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0479_text_document +0.0003008713605776459 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0274_text_document +0.000299235651831285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0272_text_document +0.0003745736454417228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0492_text_document +0.00025495685914334683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0113_text_document +0.00036183034603409704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0415_text_document +0.0002652065517219049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0405_text_document +0.00030825924668055317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0198_text_document +0.0003007499820493098 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0313_text_document +0.0003553883704883992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0331_text_document +0.0003173906831139949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0314_text_document +0.00025410681301171756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0053_text_document +0.0003848071363766293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0507_text_document +0.0003368315781090189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0438_text_document +0.0002453724684716242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0029_text_document +0.0002648420840083915 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0399_text_document +0.00029480381163674485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0564_text_document +0.0002816700525268618 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0601_text_document +0.000255851381162692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0241_text_document +0.0003559575342795207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0343_text_document +0.00028596502870150385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0223_text_document +0.0003019283845909421 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0316_text_document +0.0002310727115490268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0123_text_document +0.0002650814902223781 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0535_text_document +0.0002553585760163846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0250_text_document +0.0003356609205694754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0335_text_document +0.0002547220370698787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0253_text_document +0.00025944708152734894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0079_text_document +0.00030116112758130255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0565_text_document +0.0003230808526201062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0192_text_document +0.00024257541419356256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0098_text_document +0.00019567998656160198 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0247_text_document +0.0001695341620763688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1166_text_document +0.0001624146809319408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1192_text_document +0.00014333841136634416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0818_text_document +0.00012708025826068252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0166_text_document +0.00016352699396236217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0529_text_document +0.00021728611911473635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1356_text_document +0.00015636251730253634 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0858_text_document +0.00014324586822581748 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0823_text_document +0.00015897461323219583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1339_text_document +0.00017108674190078807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0627_text_document +0.00015013827861652795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1370_text_document +0.0001972006042068891 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0341_text_document +0.00012269303700377157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0185_text_document +0.00014744939263600657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1259_text_document +0.0001618181386509441 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0981_text_document +0.00016515493943054124 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0515_text_document +0.00015677010982666643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0545_text_document +0.00017095114976158616 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1167_text_document +0.00014364160317566655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0077_text_document +0.00018828479047274606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0665_text_document +0.00015465902478345193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1195_text_document +0.0001390587027239324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0085_text_document +0.00018195456827202163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0566_text_document +0.00011920421004161942 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0230_text_document +0.00021205550061396497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0311_text_document +0.0001436159963386609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0055_text_document +0.00014247480527621313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0810_text_document +0.00014589130200546316 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1270_text_document +0.0001657398838382825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0966_text_document +0.00016088126731396651 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0517_text_document +0.00016628319575737334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0843_text_document +0.00020520743975773465 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0348_text_document +0.00014665699381439506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0797_text_document +0.00015884407950940795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0736_text_document +0.00017445628170628633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0943_text_document +0.00015491157489278548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1054_text_document +0.00015361878566936873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1105_text_document +0.00018148660349496672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0556_text_document +0.00016318924883005205 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0849_text_document +0.00017735550904648365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0492_text_document +0.00015506323262027988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1121_text_document +0.00014514138359296612 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0817_text_document +0.00016129591328185056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0874_text_document +0.00017893089540432167 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0579_text_document +0.00015469510066293024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1250_text_document +0.00014147217452585304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0146_text_document +0.00018002538811588936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0589_text_document +0.00012896971047756227 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0169_text_document +0.0001493805247240001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1084_text_document +0.0002033040613836003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1041_text_document +0.00015483383064936307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0526_text_document +0.00018457049518666235 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0551_text_document +0.00012464750991886638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0193_text_document +0.00020263121348102227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1402_text_document +0.00015732724749731566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1106_text_document +0.00014994880426810227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0782_text_document +0.0001427354007022011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0659_text_document +0.00017383550442341477 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0588_text_document +0.00014794741120707251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0990_text_document +0.00013965112476129038 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0833_text_document +0.00016830993685690627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0845_text_document +0.0001578459852914722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1098_text_document +0.00016544832622877084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0402_text_document +0.00016619317471768417 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0878_text_document +0.0001705972129133786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0930_text_document +0.00013964228274235854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0046_text_document +0.00016071569403386412 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0440_text_document +0.0001652980777911164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1293_text_document +0.00017149553387813454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0393_text_document +0.0001440150819055646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0049_text_document +0.0001596228732463621 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0305_text_document +0.00016054511332990351 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0868_text_document +0.00015086161253385788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1126_text_document +0.00016278072512770076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0531_text_document +0.0001460824476010622 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1087_text_document +6.535045240771344e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1442_text_document +0.00014918602129116153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0997_text_document +0.00016976775852345032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0366_text_document +0.00012941294747512296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0165_text_document +0.00021543528225174234 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1078_text_document +0.00016961153312648427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0957_text_document +0.00014846894027958484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1002_text_document +0.00016792533024266346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0269_text_document +0.00015032152307353972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0460_text_document +0.00019090043692832962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1397_text_document +0.00017126164591608773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0250_text_document +0.00017052271730189918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0951_text_document +0.0001529819080510649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1246_text_document +0.0001593904679041581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0876_text_document +0.0001535686933803246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0302_text_document +0.0001828537136916407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0564_text_document +0.00018216559554926296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0584_text_document +0.0001518375491324927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0622_text_document +0.00017989122128631265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0694_text_document +0.0001937399691335672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0335_text_document +0.00016284056669703547 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1189_text_document +0.0001274714871382623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0215_text_document +0.0002039840022153184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1390_text_document +0.00013201968828389528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0204_text_document +0.0002118653491475742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1038_text_document +0.00016037864519813518 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0713_text_document +0.0001810568673406439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0567_text_document +0.0001520426436740549 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1130_text_document +0.0001363916939036548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0221_text_document +0.00016600137546972103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0538_text_document +0.000157091798621546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1232_text_document +0.00015120338743186473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1265_text_document +0.00014339602031912966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0628_text_document +0.00013718423915964128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0090_text_document +0.00016701812935300498 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0968_text_document +0.00017269235601396706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0248_text_document +0.00015271729639722688 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0885_text_document +0.00016313458661522843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0977_text_document +0.0001547258823418314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0749_text_document +0.00016083316892410838 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0527_text_document +0.0001583053763245904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0985_text_document +0.0001733472209179004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0934_text_document +0.00014798010488118723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0993_text_document +0.00013889389681410628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0088_text_document +0.00019105504252193975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0674_text_document +0.0001260784262432769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0171_text_document +0.00016818286634093132 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1160_text_document +0.00014580962546884019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0640_text_document +6.470029145424178e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1419_text_document +0.0001681159453803032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0488_text_document +0.00017801942490344936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0704_text_document +0.00015673746308375355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0887_text_document +0.0001539147990354861 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1055_text_document +0.0001456873195980062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1258_text_document +0.00017094160309773435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0924_text_document +0.00016796911953592162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0390_text_document +0.000151517525944017 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0612_text_document +0.00014541128245282718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1276_text_document +0.00015875350296674313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0744_text_document +0.00016829864426209354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0399_text_document +0.00013812489725796757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0053_text_document +0.00014239921197099827 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0025_text_document +0.0001675238599840661 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0371_text_document +0.0001302247584407276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0161_text_document +6.682470879450296e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1444_text_document +0.00014844153787284714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0051_text_document +0.00017069965470177095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0367_text_document +0.00013975554284966803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0036_text_document +0.00020383835732242943 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1398_text_document +0.00016109580427796109 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0434_text_document +0.00016807102499995444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0965_text_document +0.0001581069814377221 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1226_text_document +0.00014313504064392658 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0093_text_document +0.00017857519389779866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0702_text_document +0.00016332263700781356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0949_text_document +0.00015031787169065788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1343_text_document +0.0001814913146656133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1480_text_document +0.00017181771454811032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0503_text_document +0.00016211098122773695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0979_text_document +0.00014013915195024086 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1039_text_document +0.00014875652926934062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1034_text_document +0.00015743134722066542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0725_text_document +0.00017191287320061663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1185_text_document +0.00017494945169756188 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1288_text_document +0.0001555344866428178 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0742_text_document +0.00013537753432305735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0242_text_document +0.00014710662157811443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1071_text_document +0.00016411831898287708 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0975_text_document +0.00016201729549295002 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1211_text_document +0.00015221296332320523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0435_text_document +0.00015852377744129056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0841_text_document +0.0001637600734893311 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1303_text_document +0.00020336175111220435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1380_text_document +0.00018339831779819768 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0558_text_document +0.00015777501854536213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0522_text_document +0.00013195539998318593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0181_text_document +0.0001386037527814194 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0098_text_document +0.00020002678200326375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0318_text_document +0.0001502810207694568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1101_text_document +0.00016701450707649348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1183_text_document +0.00014537187039666396 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0054_text_document +0.00016542204453010793 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0962_text_document +0.00016147214242670993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0072_text_document +0.0001666473715046599 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1283_text_document +0.00014610222865137749 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1269_text_document +0.00017003850913949867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0379_text_document +0.00013929800966037666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0109_text_document +0.00015355042744068418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1257_text_document +0.00018749589298954898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0683_text_document +0.00018128248525778134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0932_text_document +0.00015847515536239644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0724_text_document +0.0001675613909564323 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0259_text_document +0.00015830816489537683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0752_text_document +0.0001559684384825985 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0748_text_document +6.807932805918992e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1456_text_document +0.00014366963847673678 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0038_text_document +0.00016516766636021026 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1311_text_document +0.00013191015853023994 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0224_text_document +0.00016218733445741242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1181_text_document +0.00016199692216184222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0292_text_document +0.0001498689357109041 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0006_text_document +0.00021774546028345258 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1077_text_document +0.0002018860545241583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1376_text_document +0.00013948006210998777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0082_text_document +0.0001555107547975781 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1107_text_document +0.00016590320374380407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1305_text_document +0.00015357096232342907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1263_text_document +0.00017149756229882957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1484_text_document +0.00014945745805040604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0470_text_document +0.0001370605666431844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0232_text_document +0.00020871806603751215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0312_text_document +0.00017949882612324094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1478_text_document +0.00012967198288991124 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0176_text_document +0.00014407639684388027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0825_text_document +0.0001738289893346974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0587_text_document +0.00016432007712212363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0726_text_document +0.00021987744618253408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1069_text_document +0.0001658824793312056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0478_text_document +0.00018343459381217617 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1470_text_document +0.0001540243960271133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0751_text_document +0.00014076985382851758 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0034_text_document +0.00018327802049255027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0557_text_document +0.00013541979587031706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0087_text_document +0.00018283903212254103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0554_text_document +0.00018356592152967213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0680_text_document +0.00013528981629672019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0095_text_document +0.00016195981108835402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1193_text_document +0.0001692203965318927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1290_text_document +0.00016769038226722118 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0956_text_document +6.577591782266853e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1433_text_document +0.00020548459039080355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1409_text_document +0.00016625397566908732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0546_text_document +0.0002191514358329144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1137_text_document +0.00022811276854586046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1042_text_document +0.00021400238398657707 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1361_text_document +0.00016277096569402517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0983_text_document +0.0001583899704928049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1099_text_document +0.0001880639175708719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0675_text_document +0.00015133651117035432 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0000_text_document +0.0001625341698025103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0540_text_document +0.00017771535471350786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0593_text_document +0.00016815517512679766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1182_text_document +0.00016079574219316162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0984_text_document +0.00016439910543030416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0463_text_document +0.00016528739620445078 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0406_text_document +0.00016489000174022887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0963_text_document +0.0001557518593344314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1112_text_document +0.00017657136921387344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0572_text_document +0.00017613819918473885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0894_text_document +0.00016818136837819556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0901_text_document +0.00015958987474506617 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0539_text_document +0.00018218170919691117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1462_text_document +0.00016718904263673248 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0504_text_document +0.00015987218216956836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1196_text_document +0.00017189442585383062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0916_text_document +0.0001634813294517073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0840_text_document +0.0002205095381720346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1059_text_document +0.00017396541314894736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0895_text_document +6.507565239609069e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1439_text_document +0.00015962194770891035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0521_text_document +0.00012643543528015894 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0227_text_document +0.00012965619956572215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0157_text_document +0.00013223758759774493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0134_text_document +0.00014136760030097697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0091_text_document +0.0001381774119190453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0080_text_document +0.00019798512467862197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1405_text_document +0.00016156745631319154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0856_text_document +0.00019974862821575546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0355_text_document +0.00018200165470784005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0904_text_document +0.00015113105990653198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1256_text_document +0.00015548753626235857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0886_text_document +0.00017554167495420438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0703_text_document +0.00015653678368525705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0870_text_document +0.00016896257320564437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1307_text_document +0.0001903984601165236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1463_text_document +0.00014624534535139798 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1278_text_document +0.00015877224250538676 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1176_text_document +0.00015006091298155116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0999_text_document +0.00019757096381691082 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0352_text_document +0.00016078369621718087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1216_text_document +0.00016656194994838216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0506_text_document +0.00016849271470946895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0942_text_document +0.00012907462743559026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0164_text_document +0.00014130711072004757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0071_text_document +0.00019625342379966053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0362_text_document +0.0001551313555160629 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0285_text_document +0.00014941428518043363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0819_text_document +6.68063248499656e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1447_text_document +0.0001529851241921758 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0794_text_document +0.00015413689147171587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1111_text_document +0.00015819723420022034 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0452_text_document +0.0001421845421104754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1008_text_document +0.0002337347761220641 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1064_text_document +0.00015326757579474523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0786_text_document +0.00016886422097510493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0275_text_document +0.00014601039985789424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1004_text_document +0.00014884931563984607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1020_text_document +0.00015986533743428418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1198_text_document +0.00016926153082778508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0258_text_document +0.00012837862440004137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0179_text_document +0.00015414960057655343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0473_text_document +0.00017497436201235553 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0706_text_document +0.0001432922811537633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0829_text_document +0.00021642532731730042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1348_text_document +0.00016753124352564838 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0489_text_document +0.00015608314375504165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0721_text_document +6.782620642709544e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1418_text_document +0.00014090986584903154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0657_text_document +0.00012660624973760565 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0182_text_document +0.00014735197135805754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1268_text_document +0.00018616875148016776 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0677_text_document +0.0001272812571523608 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0211_text_document +0.00015478061878918103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1199_text_document +0.00017175987703134661 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1161_text_document +0.00016736774334113506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1159_text_document +0.00021542375861101283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1060_text_document +0.000145172522445389 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0203_text_document +0.00015175519683168932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0634_text_document +0.00014496879545197283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0214_text_document +0.00017092840392284674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1292_text_document +0.0001278331608658506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0168_text_document +0.00015032268010816978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1342_text_document +0.0001385868171608466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0105_text_document +0.00015134071512301172 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1346_text_document +0.0001269932638322507 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0145_text_document +0.00013140246671645566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0174_text_document +0.00020348737894498417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1393_text_document +0.00016800561035040085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1295_text_document +0.00016164160174570342 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0888_text_document +0.00018931294863807786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0673_text_document +0.00019734558642218287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1412_text_document +0.0002303194107055354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1337_text_document +0.0002167143077720647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1089_text_document +0.0002143413186443493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1031_text_document +0.00015854607653108938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0542_text_document +0.00013590494333364677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0135_text_document +0.0001709192279703633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0604_text_document +0.0001684909541125075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0905_text_document +0.00014641981954006535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1028_text_document +0.00015196906818488852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0760_text_document +0.00015046293445613942 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0052_text_document +0.00021207499511319207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0361_text_document +0.000170520846597118 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0408_text_document +0.00016506154746702737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0514_text_document +0.00022220671190117854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1144_text_document +0.00013485250436339217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0086_text_document +0.00014243329417692134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0016_text_document +0.0001997902496484977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0363_text_document +0.00013773786894858352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0026_text_document +0.00014210492964421037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0013_text_document +0.00014261494951636302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0048_text_document +0.0001994698002434822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0343_text_document +0.0001447668168714561 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0991_text_document +0.00015128160843312126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1151_text_document +0.0002074354130511704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0325_text_document +0.00012703221289405721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0209_text_document +0.00016873053428782402 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1325_text_document +0.00014384384709797832 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0042_text_document +0.00014223883045509972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0162_text_document +0.00017796337347992502 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0902_text_document +0.0001491404477097102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0625_text_document +0.00016202535163988179 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0482_text_document +0.00016604798605022845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0502_text_document +0.00012837092768293909 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0225_text_document +0.0001660767481080349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1190_text_document +0.00017106130812258926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0498_text_document +0.0001266757182953492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0198_text_document +0.00015520576268027733 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1146_text_document +0.00016248205157470968 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1197_text_document +0.00016174543116338102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0889_text_document +0.0001607107134600685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0873_text_document +0.000142616278481646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1013_text_document +0.00015710288183099663 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1212_text_document +0.0001393562142784163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0107_text_document +0.00014454298256561947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0005_text_document +0.00016005107736770166 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0528_text_document +0.000135126504062645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0101_text_document +0.0001508209689849079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0792_text_document +0.0001451914251150852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0638_text_document +0.00015254108755913753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1253_text_document +6.510273756595438e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1449_text_document +0.00016128557059261363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0761_text_document +0.0001664509064378366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1330_text_document +0.0001645218431584474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0268_text_document +0.0001644207050646909 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0372_text_document +0.00015724536834425392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0757_text_document +0.0001423864254748038 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0031_text_document +0.00014675658024529065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0815_text_document +0.0001383859658316972 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0074_text_document +0.0001537984956202417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0771_text_document +0.0001689143410167348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0936_text_document +0.00021971795320063967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1355_text_document +0.0001615226814646947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0764_text_document +0.00016998404232092888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0516_text_document +6.971484231182006e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1460_text_document +0.000165053699351673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0394_text_document +0.0001284057718439998 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0178_text_document +0.0001469131892145795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1005_text_document +0.00015600815153021962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1103_text_document +0.00017596769632339667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0899_text_document +0.0001629273325614891 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1204_text_document +0.00014895594062286423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1061_text_document +0.00021388747632332592 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0327_text_document +0.0001272010357086257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0233_text_document +0.00013763895692808363 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0057_text_document +0.00017734499397533223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0922_text_document +0.00014915400067365785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0996_text_document +6.676606803216924e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1457_text_document +0.0001617530978715898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0513_text_document +0.00016745873391627768 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1323_text_document +0.00016334347288201646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0426_text_document +6.651419662021617e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1428_text_document +0.00015587845196441837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0879_text_document +0.00016753160949877044 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0297_text_document +0.00021020002387965022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1395_text_document +0.0002065001249653627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0333_text_document +0.00012663980960827745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0202_text_document +0.00016757511900986375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1171_text_document +0.00015605009719439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0746_text_document +0.00015715662663107652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0111_text_document +6.636371600849109e-05 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1459_text_document +0.00014498611080914514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0806_text_document +0.0001542272198205248 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1332_text_document +0.00014059323118358123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0003_text_document +0.00021921136810582192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1082_text_document +0.00015167674681519302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0745_text_document +0.00016013437912281925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1187_text_document +0.0001439794004286864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0803_text_document +0.00020770381846091124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0326_text_document +0.0001522225828657984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1131_text_document +0.0001841469798985223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0678_text_document +6.985047980280837e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1458_text_document +0.0001946928000123715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1465_text_document +0.00019167154590661272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0662_text_document +0.0001579237297605254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0715_text_document +0.00014555179650182237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0043_text_document +0.00014278974929024318 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1029_text_document +0.00014073888645548604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0108_text_document +0.00015833078302709586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0450_text_document +0.00015175492630038637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0465_text_document +0.00013753021891282864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0125_text_document +0.0001507527795280453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1052_text_document +0.00014715051646657675 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0988_text_document +0.00016685078464566375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0487_text_document +0.00016411180976055992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0872_text_document +0.00014019467269017514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0096_text_document +0.00016270832291858043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0865_text_document +0.00014389487056524366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0643_text_document +0.00016448630021886695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0507_text_document +0.0001428288066548232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0032_text_document +0.00015068597679794492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1228_text_document +0.00015437630829034905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1076_text_document +0.00015394531108560747 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1229_text_document +0.00012456910277221792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0219_text_document +0.00022499754937441147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1341_text_document +0.00016328521704579013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0384_text_document +0.00017564631641705234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0583_text_document +0.00013090562187669734 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0236_text_document +0.00014405833194126315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1267_text_document +0.00018000171719095975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0682_text_document +0.00016551054323893732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1324_text_document +0.00018462225150269493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0672_text_document +0.00014938124380928987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1237_text_document +0.00016278567817454143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0256_text_document +0.0001801705673068524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0689_text_document +0.0001447484970060597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0801_text_document +0.0002061767466472168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1392_text_document +0.00014733469571190217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0630_text_document +0.00015454726196198582 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0041_text_document +0.00021817938865763232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1027_text_document +0.0001298786214228879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0173_text_document +0.00018138691914031344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1476_text_document +6.537426029052535e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1455_text_document +0.00014073700672547374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0172_text_document +0.0001256698111604605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0228_text_document +0.00014477850589954592 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0812_text_document +0.00016291893052257454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0980_text_document +0.00015081447600800676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0781_text_document +0.00014969267700841283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1280_text_document +0.00012692990964296264 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0141_text_document +0.0001509226797295792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1046_text_document +0.00014673420805111974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0656_text_document +0.00013712771887536008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0137_text_document +0.00012288465935720468 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0234_text_document +0.00017494555279771646 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0909_text_document +0.0001638807582030245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0410_text_document +0.0001597837545344341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0836_text_document +0.00016533452934584025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1313_text_document +0.00016323215726075254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1326_text_document +0.00015298318038302255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1113_text_document +0.00020767747087806112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1040_text_document +0.00021672481833060058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1345_text_document +3.750780482356549e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1492_text_document +0.00014608217797228235 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0831_text_document +0.00016476762411880743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0293_text_document +0.00021699554593230955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1063_text_document +0.00021105380532881085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0309_text_document +0.0002047160064465581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0353_text_document +0.00017221227712043642 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0597_text_document +0.00014407040837739895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0011_text_document +0.00012338500621226977 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0189_text_document +0.00017444094648482255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1174_text_document +0.00015546253659777677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0474_text_document +0.00014381681585387058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1079_text_document +0.00013897398671509773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0066_text_document +0.0001775141529797601 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0697_text_document +0.0001591753095530007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0734_text_document +0.00015104252960939366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0756_text_document +0.00017569007412200155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1488_text_document +0.00014319238402928628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0238_text_document +0.00012505086780455324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0235_text_document +0.0001612298998082119 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0274_text_document +0.0001838767026558464 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0417_text_document +0.0002050680150541361 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0316_text_document +6.618274004332223e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1451_text_document +0.00016022951634040166 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0864_text_document +0.00013274827939835476 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0127_text_document +0.00017726136663500188 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0900_text_document +0.0001245682131100599 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0244_text_document +0.00015882962379671717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0291_text_document +0.0001617639209287533 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1205_text_document +0.00022875188600089843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1344_text_document +0.00022135260148234352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1351_text_document +0.0001486995466675951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0441_text_document +0.00012704459393174345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0213_text_document +0.00022842395422801987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1143_text_document +0.0001635885246037017 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1320_text_document +0.0001582665195457721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0437_text_document +0.0001775072258892419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0927_text_document +0.00020242381998746212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0324_text_document +0.0001545652142748447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1125_text_document +6.81810618182006e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1421_text_document +0.0001597601607752306 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0735_text_document +0.0001852365073791873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0573_text_document +0.00014051207788473263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0002_text_document +0.00019277743184432014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1416_text_document +0.00014805940507445537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0776_text_document +7.404378291311911e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1430_text_document +0.00016241379871559847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1322_text_document +0.000128787783253033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0180_text_document +0.00016198778749979117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0272_text_document +0.00017479293523689206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0896_text_document +0.00016144158141470971 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0543_text_document +0.00017984364568296736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0570_text_document +0.00016364824117358535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0859_text_document +0.00015835449907818108 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0255_text_document +0.0001598217873096508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0263_text_document +0.00016255842781872452 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1299_text_document +0.00014000754828630328 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0835_text_document +0.00014040163951593393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0039_text_document +0.0001553155586782508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0611_text_document +0.00016963744253436334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0369_text_document +0.00015530821304263922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0732_text_document +0.00017145228136049745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1485_text_document +0.00015677612604573347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0045_text_document +0.0002281728955381295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1138_text_document +0.0001438605015826016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0067_text_document +0.00014531584582320225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0642_text_document +0.00016441162523091893 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0961_text_document +0.00017332479727991208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0249_text_document +0.00015197205734143667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1221_text_document +0.0001731814495339748 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0496_text_document +0.00016262394941726692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1312_text_document +0.00017293102712287865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0599_text_document +0.0001666866506642948 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0497_text_document +0.00019375975153766717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1413_text_document +0.00019848025607825147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1382_text_document +0.00019589076429120272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0661_text_document +0.00014782024524927844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1251_text_document +0.00017741559068696627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0560_text_document +0.00021151104610061383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0350_text_document +0.00013945420098709522 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0018_text_document +0.00019887995129951757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0354_text_document +0.00015547221818883392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0007_text_document +0.00017258790250155727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0594_text_document +0.00014928346919354236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1003_text_document +0.00020395282974390957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1375_text_document +0.00016193806231202298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0298_text_document +0.00012846951975899564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0199_text_document +0.00018501450203643792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0555_text_document +0.0001666431623624946 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0419_text_document +0.00015879678007813504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0400_text_document +0.0001532191493377357 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0608_text_document +0.0001525105010508594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0789_text_document +0.00015180606799817945 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0618_text_document +0.00012173158477293636 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0212_text_document +0.00017380279337573016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0493_text_document +0.00014661984145047447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0633_text_document +0.00017498764263995404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0920_text_document +0.00014684899748384315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0621_text_document +0.00013467484359124581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0129_text_document +0.00020840519038082484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1033_text_document +0.0001598348481728714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0982_text_document +0.0001579504038311114 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0300_text_document +0.00014348890732096214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1000_text_document +0.00016758977485705474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1284_text_document +0.00012564178714092916 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0241_text_document +0.0001291169546414064 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0163_text_document +0.00016162437122570363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0971_text_document +0.000167053672743847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0455_text_document +0.000153169062689461 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1124_text_document +0.00018220285305712615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1464_text_document +0.00020403943721555701 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0332_text_document +0.00015665194631128744 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1225_text_document +0.00012614521656368453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0160_text_document +0.00014013351682155804 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0132_text_document +0.00017097003189888156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0953_text_document +0.00020205923223837476 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1396_text_document +0.00021572515124051073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0319_text_document +0.00014717040017889609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0014_text_document +0.0001590281915167615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0867_text_document +0.00015492644283290785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0765_text_document +0.00016331970448459069 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0731_text_document +0.00013421158682600656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0121_text_document +0.00015029220415897293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1340_text_document +0.00018799263088031847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1414_text_document +0.00015276640696675555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1149_text_document +0.00017523242315344403 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0897_text_document +0.00015019996810193524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1140_text_document +0.00018958357448030594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1406_text_document +0.00017626847839600905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1491_text_document +0.00014560654964584956 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0811_text_document +0.00014687591072305394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0047_text_document +0.00017120995999685788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0907_text_document +0.00012358194307305004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0184_text_document +0.0001669042859901341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1202_text_document +0.00017927650173857129 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0595_text_document +0.00022615351090448548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1333_text_document +0.00013263308292770764 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0149_text_document +0.00015078826876750059 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1234_text_document +0.00018088433380229892 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1289_text_document +0.0002049892875117827 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0331_text_document +0.0001888977519054645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1415_text_document +0.00015011406347016157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0769_text_document +0.0001677857169117344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0935_text_document +0.00012791852714258775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0216_text_document +0.00016505191077649506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0940_text_document +0.0001571941770076897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0762_text_document +0.00015430477295467815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0445_text_document +0.00016414354518076008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0378_text_document +0.00016197041415550887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0280_text_document +0.00016713884836257664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0376_text_document +0.0001566985564405333 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1047_text_document +0.00014945756703007185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1045_text_document +0.00022406061576333915 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1056_text_document +0.00021167575070222058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0356_text_document +0.00013677667461372767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0210_text_document +0.00020680395688920254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1386_text_document +0.00015760638526064722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1110_text_document +0.00015453551359300138 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0476_text_document +0.00016397827492279263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1163_text_document +0.00017322755740910864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0598_text_document +0.00016647054827396185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0511_text_document +0.00015954468834603007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0279_text_document +0.00015699856956345393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1254_text_document +0.00014753457389550566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0115_text_document +0.00016888120307561802 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0365_text_document +0.00016542923944435972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1298_text_document +0.0001646539804391752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0839_text_document +0.0001738747195039087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1227_text_document +0.00016771993042265853 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1282_text_document +0.00014067444517710913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0030_text_document +0.00016285517606428614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0254_text_document +0.00014496445138260225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0658_text_document +0.00016065034278655918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0978_text_document +0.00016296100118619596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0851_text_document +0.00013435857715522536 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0130_text_document +0.00020346664054327743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0357_text_document +0.00012340782876547375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0152_text_document +0.00016711343759871717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0952_text_document +0.00014642480417521495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0834_text_document +6.437624301268493e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1436_text_document +0.00016874334334709629 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1302_text_document +0.00016096287210000209 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1210_text_document +6.894858552341982e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1445_text_document +0.00016183045290192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1328_text_document +0.0001266295474692283 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0188_text_document +0.00015582561352317403 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1152_text_document +0.00020064552251710147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0340_text_document +0.0001669433724610845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0534_text_document +0.00015678773700557408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0986_text_document +0.00016617707551040578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0892_text_document +0.00014355808368684807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0062_text_document +0.00016776954911269944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1173_text_document +0.00013215274478441298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0009_text_document +0.00016045529919363785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0537_text_document +0.00013995190683390273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0058_text_document +0.00015982388364521005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0286_text_document +0.00021515127560185033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1350_text_document +0.0001571023238460824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0837_text_document +0.00013752792560160014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0068_text_document +0.00016843360937336314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1291_text_document +0.00015891629060925923 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1108_text_document +0.00013203126675290382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0158_text_document +0.00016411457554263775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0425_text_document +0.00021230579644619457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1007_text_document +0.00015986075758075047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0717_text_document +0.0001701948205017912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1314_text_document +0.00015377327528875894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1097_text_document +0.0001371169603485991 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0131_text_document +6.898654632241831e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1425_text_document +0.00013787821806147992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0050_text_document +8.926989271708098e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1432_text_document +0.0001697487861338437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0257_text_document +0.00017584653056488438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1487_text_document +0.00014710604306079943 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0793_text_document +0.00014185002801090756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0655_text_document +0.0002048177532921539 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0339_text_document +0.00016120729340929332 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1109_text_document +0.0001223960473332774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0151_text_document +0.0001453530879881725 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0830_text_document +0.00016900557843194865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0912_text_document +0.0001762587422023036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0700_text_document +0.0001701060296011042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0959_text_document +0.00017187900714897894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1156_text_document +0.00014750294655244822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0798_text_document +0.00021646525493979158 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1371_text_document +0.00014394646036576203 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0023_text_document +0.00018149198320960877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1483_text_document +0.00021898530050654377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1357_text_document +0.00014039681214543667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0122_text_document +0.00015372532175386033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0462_text_document +0.00014244364882458953 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1021_text_document +0.00018340422661701435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0562_text_document +0.00017297231732963866 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0505_text_document +0.00014815439743707823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0787_text_document +6.566845033432497e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1420_text_document +0.00019458570097908857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1399_text_document +0.00014141643718037182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1024_text_document +0.00016580103135856077 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1296_text_document +0.00015323268199491331 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0454_text_document +0.0001652102562002521 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0844_text_document +0.00013734197357525123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0133_text_document +0.00020796648585433443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1377_text_document +0.0001451504261976597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1037_text_document +0.00017540222926694344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0705_text_document +0.0001441876443246492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0623_text_document +0.00016583482656213415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0950_text_document +0.00016662819288852212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0547_text_document +0.00012275448077726155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0240_text_document +0.00014899869630152795 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1011_text_document +0.00017669708053004115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0893_text_document +0.0001380068262512076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0117_text_document +0.00016438704208075348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1178_text_document +0.00015566951718701304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1120_text_document +0.0001968298289849741 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0346_text_document +0.00021831789053134852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1354_text_document +0.00015828325931882951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0880_text_document +0.00014054355631147654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1019_text_document +0.00016980283350479704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0477_text_document +0.0001554936668762301 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1231_text_document +0.00020725872023506206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0660_text_document +0.00015766043466227753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0471_text_document +0.00016385202931294238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1168_text_document +0.00015936796236219736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1214_text_document +0.00020362582892362142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1378_text_document +0.00014733969185422145 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1142_text_document +0.00015761417290695306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1102_text_document +0.0001462337959887598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1015_text_document +6.628581021343156e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1438_text_document +0.00020969954548184154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1372_text_document +0.00016810696279339656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0945_text_document +0.00014940505169210177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1248_text_document +0.0001528759123522868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1119_text_document +0.00014503240257336312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0064_text_document +0.00016497435403224923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0973_text_document +0.00016079571489183747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0509_text_document +0.00022369993639058988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1135_text_document +0.0001420789135129733 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0019_text_document +0.00019132102172392381 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0663_text_document +0.00013840860111135537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1025_text_document +0.00016229722476905113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0261_text_document +6.379280220844888e-05 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1431_text_document +0.00020113552848797584 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1403_text_document +0.000137631385661694 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0004_text_document +0.0001982410829408586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1374_text_document +0.00017079185533477647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1164_text_document +0.00015664816631886207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0740_text_document +0.00016302101603996825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1217_text_document +0.00017408551776591026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0387_text_document +0.00016607109177005895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0911_text_document +0.0001657745682527523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1203_text_document +0.00014511872575023014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0995_text_document +0.00016493395371327803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0535_text_document +0.00016439024152518683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1319_text_document +0.00014733104613406654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0795_text_document +0.0001690933348155181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1184_text_document +0.00017257110857711312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0915_text_document +0.00018430963487014087 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0576_text_document +0.00020464869311400685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0329_text_document +0.00017025858150337216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0929_text_document +0.00013900029887391568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0142_text_document +0.00016748130895251811 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0413_text_document +0.00016329165429188912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0520_text_document +0.00021736941680849996 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1080_text_document +0.000152394024802439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1334_text_document +0.0001613925886244472 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1207_text_document +0.00016307955820165195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0869_text_document +0.0001441486009491774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1012_text_document +0.00014737653916533006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0475_text_document +0.00013080649821452762 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0167_text_document +0.00017285284583263907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0494_text_document +0.0001698093675464998 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0264_text_document +0.00021815398376017308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1358_text_document +0.00014901447037485513 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0641_text_document +0.00016660249038406657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1318_text_document +0.0001633499708227947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1194_text_document +0.00015045744553620076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0059_text_document +0.00018084233263295714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0578_text_document +0.00014157085008529747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1095_text_document +0.00017857213682638632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0692_text_document +6.764988138915539e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1422_text_document +0.00012506935057537142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0195_text_document +0.00015870181027859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0431_text_document +0.00017100696021143952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0928_text_document +0.00016895043878239859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0388_text_document +0.00018224812542148869 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0690_text_document +0.00015411896578328878 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0467_text_document +0.00017368407706494485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0415_text_document +0.00014234118808321183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0119_text_document +0.00012867800914124502 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0187_text_document +0.0001561855555345405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0424_text_document +0.00016278956850041728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0716_text_document +0.00016733080487519212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0854_text_document +0.00016728745150059723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0252_text_document +0.0001402355472850419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0635_text_document +0.0001548607145565727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0755_text_document +0.00016398300482079378 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0294_text_document +0.00015930313976780547 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0719_text_document +0.00014962011798590175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0785_text_document +0.00017733082052674487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0914_text_document +0.00021601718154650243 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1364_text_document +0.00014373735884721167 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0021_text_document +0.00015452708357587032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1242_text_document +0.00012337326055085463 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0220_text_document +0.00013280279359938795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0139_text_document +0.00016483325402012335 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1220_text_document +0.000205812101362281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1383_text_document +0.00013902944382715256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0102_text_document +6.614313574973613e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1424_text_document +0.00017765117832566445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0921_text_document +0.00016689470232283466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1215_text_document +0.0001598119671812959 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0722_text_document +0.00017121358295026171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0908_text_document +0.00014648650042954557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0805_text_document +0.00016928267062664605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0414_text_document +0.0001293478705948442 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0190_text_document +0.00013665330263746718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0089_text_document +0.00015126314438965768 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1150_text_document +0.000215044687396649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1360_text_document +0.00015597172028226583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0453_text_document +0.00013982208384806586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0827_text_document +0.00014616926897068722 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1273_text_document +0.0001874960360394623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0670_text_document +0.00016167036154720208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0891_text_document +0.00016689874607763962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1329_text_document +0.00015091546586234206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1261_text_document +0.00015416991907938216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1147_text_document +0.0001511400052643602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1262_text_document +0.00020077913971010278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0359_text_document +0.0001982479504648514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0328_text_document +0.00015395483293007372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1240_text_document +0.0001992056962588949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1391_text_document +0.00012588895437678978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0156_text_document +0.00014768793829420554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0989_text_document +0.00016002155200097292 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1219_text_document +0.00016565416354685452 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0923_text_document +0.00014403017231763416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0654_text_document +0.00017805971353518544 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0698_text_document +0.0001378765574189532 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0065_text_document +0.00017674772516323008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1287_text_document +0.0001655522393554671 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0383_text_document +0.00016532529089363128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0532_text_document +0.00015941871123680576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0260_text_document +0.00021171945741461394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1349_text_document +0.00021687351691044025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1086_text_document +0.00015110950675208554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0788_text_document +0.0001534703086589531 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0113_text_document +0.00022265313954357227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1363_text_document +0.00018250509992474986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0687_text_document +0.00014816523064762675 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1083_text_document +0.0001648579725487556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0918_text_document +0.00018069138970376776 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0585_text_document +0.0001556370193604137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0392_text_document +0.00017182194196636866 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0253_text_document +0.00022743636199465037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1043_text_document +0.00015536366654460163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0449_text_document +0.00018082637331676116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0568_text_document +0.00015683285278217664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0421_text_document +0.0001622994309867993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0747_text_document +0.00016306397726827083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1175_text_document +0.00020680722402286928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1373_text_document +0.00016431500855232457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1304_text_document +0.00018184329236177466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1474_text_document +0.00017771217873467545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1482_text_document +0.00019671885429181591 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1385_text_document +0.00022059042319588803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0317_text_document +0.00017138124128438447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0603_text_document +0.00016266615478246287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0519_text_document +0.0002114083096384253 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1090_text_document +0.00013998264203286362 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0020_text_document +0.00017460079664023868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1489_text_document +0.0001692744558612773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0871_text_document +0.00017147492025024807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0510_text_document +0.0001398439109182769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0104_text_document +0.00014695579349499111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1018_text_document +0.00012807718664438787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0245_text_document +6.316427383145754e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1440_text_document +0.00015227044569712423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0758_text_document +0.00015118448137770688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1247_text_document +0.0001362583230994632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1010_text_document +0.00016646881610647842 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1179_text_document +0.0001815942961628268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0552_text_document +0.00016369442463502984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0276_text_document +0.0001851410933126402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0679_text_document +0.000181472654097742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0571_text_document +0.0001418308891057867 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1017_text_document +0.00015801449786372084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0301_text_document +0.00018119300347565183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1477_text_document +6.557011502609391e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1450_text_document +0.00016703506632736244 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0712_text_document +0.00018720487080187376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0676_text_document +0.00017695386933917195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0577_text_document +0.00018219279917528516 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0684_text_document +0.00015972826222397787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0881_text_document +0.000151526607808466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0644_text_document +0.00013976918087641902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0076_text_document +6.811248991716918e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1446_text_document +0.00014594750685791662 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0154_text_document +0.00016393685475270443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0284_text_document +0.00014289503605711647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0824_text_document +0.00012500509544238644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0231_text_document +0.00017692829431610602 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0701_text_document +0.00014991877574473838 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0790_text_document +0.00016308321725598028 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1224_text_document +0.00011997958118530069 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1454_text_document +0.0001690606185783256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0405_text_document +0.000126472844411905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0177_text_document +0.0001644991846388191 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0267_text_document +0.00016724128112573786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0944_text_document +0.0002082625574331926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1368_text_document +0.00016585627592642267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0389_text_document +0.00015349598882596164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0610_text_document +0.00013592663525102173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0128_text_document +0.00021224078026059514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1085_text_document +0.0001515362474321984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1057_text_document +0.00015458670560645793 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0459_text_document +0.00015120628046970142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0027_text_document +0.00015419195982194006 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0759_text_document +0.0001599400000125394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0458_text_document +0.00015861314731055416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0472_text_document +0.00014721703828635846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0816_text_document +0.00014278396140952358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0022_text_document +0.00014157968059705966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0832_text_document +0.00014203238461080756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0097_text_document +0.00016518095127479354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0407_text_document +0.00014681716865250086 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0820_text_document +0.00015472200307186366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0777_text_document +0.00016821273444081198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0941_text_document +0.00014175740138699298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0821_text_document +0.00014715021773142656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0796_text_document +0.00021088768534533313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1022_text_document +0.00016819933400327033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0967_text_document +0.00016916342679110154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0548_text_document +0.00015025051932398203 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1117_text_document +0.0001294676654207488 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0197_text_document +0.00013135310501987096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0175_text_document +0.00012481653918559684 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0218_text_document +0.00016966096770181028 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0397_text_document +0.00015715333715996273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0850_text_document +0.00016087188062233635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0710_text_document +0.0001855137252502281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0592_text_document +0.00015757612907582822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0447_text_document +0.00015566574734797269 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0607_text_document +0.00014981399371391565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1249_text_document +0.00015975710844206455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0436_text_document +0.00014657158083208687 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0784_text_document +0.00016481357518055052 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0728_text_document +0.00014575951693438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0646_text_document +0.0001601547549358656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1104_text_document +0.00015322997577936088 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0443_text_document +0.00016340194577505127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1154_text_document +0.0001247206585146006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0194_text_document +0.00016177516907430272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1206_text_document +0.00015253760810690859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1233_text_document +0.00012706329375937011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0170_text_document +0.00015507046995311794 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0299_text_document +0.00017035702691942293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0605_text_document +0.0001422899277036295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0033_text_document +0.00013575208908613614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0037_text_document +0.00014633884990467011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0631_text_document +0.00016217601027429076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0877_text_document +0.0001828175622258517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0565_text_document +0.00015875144995229328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0626_text_document +0.00015999193615630214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0723_text_document +0.00014015883259485993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0650_text_document +0.00017684620577768225 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0590_text_document +0.00017058179804425018 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0938_text_document +0.00015133984193190054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1132_text_document +0.00017699012504638042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1479_text_document +0.00020118545538965965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0313_text_document +0.00014356204418389593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0651_text_document +0.00017337432052813388 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0926_text_document +0.0001448346019979643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1274_text_document +0.00014308591563402811 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0828_text_document +0.00017834500041411232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0602_text_document +0.00014672692243329516 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0637_text_document +0.00015798629649839792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0423_text_document +0.0001836509845586435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0681_text_document +0.00016952286767650987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0411_text_document +0.00012067131551215888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0223_text_document +0.00016217171674621445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1186_text_document +0.0001765100136620177 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0591_text_document +6.519030577345491e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1441_text_document +0.00016620979558332153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0533_text_document +0.00021068010638327337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1366_text_document +0.00018190833539743673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1466_text_document +6.77656699149642e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1423_text_document +0.00015028582467435317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1075_text_document +0.0001697901643366072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0939_text_document +0.00015927162952712184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1327_text_document +0.00014384103100023696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0647_text_document +0.00015909349132024907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0420_text_document +0.00020294618411948416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0330_text_document +0.00014534131773300256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1016_text_document +0.00017170180688925072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0600_text_document +0.00016313482646074552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0606_text_document +0.00016271942733732424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1096_text_document +0.00012629894513046023 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0246_text_document +0.00016581582010228004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0530_text_document +0.00021444766452209495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0344_text_document +0.00018157852863279222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1471_text_document +0.00015883581352585255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0774_text_document +0.00015906208351593867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0733_text_document +0.00016487949862750869 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0283_text_document +0.00017668531772068688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0561_text_document +0.00015082627785148562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0466_text_document +0.00020956294138859988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1032_text_document +0.00015717617279975186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1188_text_document +0.00016867575113490335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1285_text_document +0.00013836362251826588 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0029_text_document +0.00021846973756306572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1081_text_document +0.00014444136139636705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0063_text_document +0.00015794892076000702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0739_text_document +0.0001603185290361435 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0954_text_document +0.00015790241442091907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0448_text_document +0.0002247536546873563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1331_text_document +0.00016016484830309278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1218_text_document +0.00017826344323617095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0688_text_document +0.00016344591624982424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1169_text_document +0.0001518328612018163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0799_text_document +0.00016015745212618693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0866_text_document +0.00016772217928462375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0853_text_document +0.00016414184482721638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0955_text_document +0.00016794059551858604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0910_text_document +0.0001707952950759885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0490_text_document +0.00015763564099859615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0861_text_document +0.00016629751203024328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0948_text_document +0.00022519004424206347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1336_text_document +0.00020131541623590195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0349_text_document +0.00014739593235633784 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0808_text_document +0.00015088026723532896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1277_text_document +0.00017138716747677106 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0601_text_document +0.00014199045022817607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0044_text_document +0.0002123155378294079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0315_text_document +0.00015126790565033696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1230_text_document +0.00014892424536494257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1255_text_document +0.00016219604515923145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0271_text_document +0.00015407400817386168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1051_text_document +0.00018740602831198903 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0481_text_document +0.00016238986966905872 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0970_text_document +0.00015825185377082692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0412_text_document +0.0001462690665919911 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0040_text_document +0.0001480287233499889 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1068_text_document +0.00021390174927508646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1091_text_document +0.00020261751949783834 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1394_text_document +0.00015379985797887985 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0862_text_document +0.00015632898789201956 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1461_text_document +6.978510057728695e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1453_text_document +0.0001625132732336281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0070_text_document +0.00016494060418046228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0852_text_document +0.00014125778850884905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0012_text_document +0.0001670062832858936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0484_text_document +0.00012910130489063797 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0140_text_document +0.00016942714359327365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0919_text_document +0.00016560471114669703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0288_text_document +0.00017267820151836504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1490_text_document +0.00015923068318166766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0863_text_document +0.00015934534716367842 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1129_text_document +0.00015526632017607095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0737_text_document +0.00015166226718563977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0791_text_document +0.00016072059902134906 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0884_text_document +0.0001803142413148948 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0708_text_document +0.00019507013635168262 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0667_text_document +6.797842868279661e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1452_text_document +0.00015425108365976967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1048_text_document +0.0001557566535372043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0609_text_document +0.0001486922014832451 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1049_text_document +0.00010314726295669035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0336_text_document +0.00015110489846907377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0800_text_document +0.0001282191763186718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0196_text_document +0.00017192028765361552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0931_text_document +0.00014468658775627455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1072_text_document +0.00012350214919963946 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0207_text_document +0.00012823594249053026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0144_text_document +0.00016124863235230142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0395_text_document +0.0001460339674439275 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0648_text_document +0.00019945548397061228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1389_text_document +0.00017712592386337505 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0582_text_document +0.00016961826143628697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0433_text_document +0.00018215906534328343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0553_text_document +0.00014668917620916662 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1088_text_document +0.00017591692917269525 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0391_text_document +0.00015546530486179318 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0304_text_document +0.00014159774188958436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0110_text_document +0.00016661197161499275 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1310_text_document +0.0001647494786660025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0398_text_document +0.00015207610020583033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0446_text_document +0.00017874839079063128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0574_text_document +0.00015464615502950034 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0772_text_document +0.00017998996521188286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0686_text_document +0.00022374552824813425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1335_text_document +8.318738874257729e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1417_text_document +0.0001980263180897782 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1388_text_document +0.0001634034913458812 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0855_text_document +0.00015856515338774624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0457_text_document +0.00021503996494446354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1093_text_document +0.0001750235089084395 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0903_text_document +0.00018680157180581995 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0669_text_document +0.00016797750803698952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0860_text_document +0.0001526585274766112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0754_text_document +0.00016251427729059063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0289_text_document +0.00016658951661432627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0946_text_document +0.00014846377243224343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1275_text_document +0.00014532755538369286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0024_text_document +0.00020085782167503894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0310_text_document +0.00016995517105782582 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1153_text_document +0.00017099512023635913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0958_text_document +0.00016745325176526242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0385_text_document +0.00015488747143365428 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1115_text_document +0.0001900892920247819 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0550_text_document +0.00014361854754586879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0653_text_document +0.00015157203224797297 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1243_text_document +0.00021193298580903045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1030_text_document +0.00015019624338905051 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0775_text_document +0.0001763798973561693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0382_text_document +0.00020921420036595025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1369_text_document +8.551501141789569e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1429_text_document +0.00018751712868149065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1400_text_document +0.00016829537532338053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1213_text_document +0.00015084066181358894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0709_text_document +0.00016195928581990262 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0842_text_document +0.00018160270203746462 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0691_text_document +0.0001466611510614814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1066_text_document +0.00016057775736524703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0766_text_document +0.00015331050001960014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1139_text_document +0.0001640469483576742 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0499_text_document +0.0001750777552857391 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0699_text_document +0.00012850484180604435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0200_text_document +0.00012402353499660769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0239_text_document +0.00016422654526752735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0439_text_document +0.00017376594719277543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1155_text_document +0.0001598186882709137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0525_text_document +0.00016917681978282753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0523_text_document +0.00015263696687634908 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0079_text_document +0.00016039189468868627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0416_text_document +0.00017618800875841804 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1481_text_document +0.0001603990699730604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0727_text_document +0.00016916333202617474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0307_text_document +0.00012948524611971614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0138_text_document +6.934197346191516e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1427_text_document +0.00017885898529592837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1475_text_document +0.00015950113709324278 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0429_text_document +0.0001227475965565912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0206_text_document +0.00016574764282947948 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0375_text_document +6.94467160073517e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1448_text_document +0.00021898407239821863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1352_text_document +0.0001729576935220182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0270_text_document +0.00014175309815679324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0103_text_document +0.00015222256729727473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0778_text_document +0.0002006873061783732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1408_text_document +0.0001633023981526495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1223_text_document +0.00018248734503882182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1472_text_document +0.00014924831298511557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0813_text_document +0.0002023904846194145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1384_text_document +0.00015017176132147363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0629_text_document +0.00010193584093009983 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0337_text_document +0.00016193099961740715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0848_text_document +0.0001352750889245726 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0124_text_document +0.00015742420916536968 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0753_text_document +0.00019506168512525843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1410_text_document +0.00012967156389355632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0243_text_document +0.0001590087132629504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0714_text_document +0.0001472438118601176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1001_text_document +0.00016995459524808012 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1317_text_document +0.0001376594963234441 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0112_text_document +0.00014719759568221027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1260_text_document +0.00016740162676634268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0251_text_document +0.00014556055909872035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1272_text_document +0.0001462316125597202 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0624_text_document +0.00017233201215418483 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1158_text_document +0.00016532348471925265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0444_text_document +0.0001732052183826621 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0925_text_document +0.00014854511323026862 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1050_text_document +0.00015156958641033844 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1134_text_document +0.00017965845673990997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0695_text_document +0.0001682718027734702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0508_text_document +0.00014679933705210826 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0615_text_document +0.00016379095940965174 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0265_text_document +0.0001595953223745724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1316_text_document +0.00014937910370135967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1148_text_document +0.00016626192441576874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1315_text_document +0.00012644557806052652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0226_text_document +0.0002038866364425408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0320_text_document +0.00012781772365927512 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0237_text_document +0.00014873714013968685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1353_text_document +0.00016647930161999792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0287_text_document +0.00017174129317793987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0262_text_document +0.0001538621049979398 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1067_text_document +0.0001563063688577774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0430_text_document +0.0001379470171727049 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0001_text_document +0.00020088705868563327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0360_text_document +0.00013316662932109192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0017_text_document +0.0001901234116854142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1468_text_document +0.00017072715683731255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0501_text_document +0.0001482947975425305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1141_text_document +0.0001427297599327488 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0998_text_document +0.00015030760729530093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1145_text_document +0.00017934778278835527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1473_text_document +0.0001624712517575163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0396_text_document +0.00017056782495579372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0913_text_document +0.00016750774701331024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0380_text_document +0.00016144568293099387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0544_text_document +0.0001916682997566083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1411_text_document +0.00014869336257922915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0106_text_document +0.0001578604687567487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0442_text_document +0.00017245289000782427 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0418_text_document +0.00016980011307450558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0491_text_document +0.00017116434783254355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0368_text_document +0.00016266646479915197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0296_text_document +0.00017164098089283116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0483_text_document +0.00016127833957743735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0536_text_document +0.00016314013780926182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1306_text_document +0.00022395345242528445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1347_text_document +0.00018132340700948072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1467_text_document +0.00016629277085070321 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0969_text_document +0.0001452258606588068 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1036_text_document +0.00014466593622241737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0010_text_document +0.00022820951721846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1044_text_document +0.00014790724622100283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0619_text_document +6.63903375071058e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1437_text_document +0.00015582467512476796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0282_text_document +0.000166431942170938 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1162_text_document +0.00015151942372449155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0750_text_document +0.00013878034925318548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0126_text_document +0.0001276561557767454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0192_text_document +0.00021753021732075337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1362_text_document +6.56791423000064e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1435_text_document +0.00021714461635167153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1359_text_document +0.00017732782166807323 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0685_text_document +0.00018223642150176753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1469_text_document +0.00014959851925403812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1338_text_document +0.00015949039029928227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0273_text_document +0.00016354645980648047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0060_text_document +0.00013120579158737794 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0229_text_document +0.00020283796144492631 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0358_text_document +0.00015607531075694943 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0743_text_document +0.00016785011193580181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1294_text_document +0.0002289988842393063 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1136_text_document +0.00012609574092355686 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0155_text_document +0.00015391393870532928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1122_text_document +0.00017291662195148216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0707_text_document +0.00016930397761791359 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1301_text_document +0.0001503707315668261 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0814_text_document +0.00013755997103848938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0083_text_document +0.0001641777724672735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0303_text_document +0.00016249963588376947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1208_text_document +0.00015091532371495184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1116_text_document +0.00016925706424090195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0512_text_document +0.00012331579081225924 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0205_text_document +0.00016572415150664923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0847_text_document +0.00015896504445652774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0464_text_document +0.0002172811514018951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1365_text_document +0.0001467552821914243 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1266_text_document +0.00016053641120205404 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0278_text_document +0.00016092234700721057 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0075_text_document +0.00014739875025912542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0201_text_document +0.00016162938474159285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1308_text_document +0.00014983174047713567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1235_text_document +0.00014507763071409317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0056_text_document +0.0001514590704245521 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0061_text_document +0.00016877330395559598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0917_text_document +0.00016571159380006375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0401_text_document +0.00017140951259524858 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0617_text_document +0.00014775266341610126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1236_text_document +0.0001618545976514836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0974_text_document +0.00017574442993022852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0693_text_document +0.00015133337648195428 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1245_text_document +0.0001641743334029538 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0524_text_document +0.00016161723136536173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0541_text_document +0.00020025714719197222 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0351_text_document +0.00013146292989858402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0099_text_document +0.00015213655142952746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0614_text_document +0.0001594237312959875 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0403_text_document +0.00019928522546740278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0321_text_document +0.0001718576209613083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0370_text_document +0.00019870747890711344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1379_text_document +0.00014150759607607492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0084_text_document +0.00018497707327481416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0666_text_document +0.0001494245626623757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0783_text_document +0.00014567464546593923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0028_text_document +0.00012921109005833406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0208_text_document +0.00015522839502673537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1244_text_document +0.00016889787876953445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1300_text_document +0.00016128963916680415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0883_text_document +0.00015501692461064587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1239_text_document +0.00021246309426624894 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0308_text_document +0.00014578020908177723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0035_text_document +0.00017141531649575344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0486_text_document +0.00014815746443624483 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1271_text_document +0.00013641616762289285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0078_text_document +0.00015052190193183982 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1127_text_document +0.0001719375436698932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0377_text_document +0.00014287195470541003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0652_text_document +6.855329371241895e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1443_text_document +0.0001610074517778759 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0711_text_document +0.0001617237746700382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0838_text_document +0.00020562372967146788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0322_text_document +0.00015409004826587622 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1222_text_document +0.00016204355350038122 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0290_text_document +0.00016418022845829302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0857_text_document +0.00015404756921054644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1062_text_document +0.000126591638339678 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0186_text_document +0.00015729247418431926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1114_text_document +0.00014957377500349715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1073_text_document +0.00014652199914549352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0802_text_document +0.00015695341294768054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1123_text_document +0.00016273826976360268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0281_text_document +0.0001585705206926955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0770_text_document +0.00015456957120516966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0620_text_document +0.00017625472845639211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0581_text_document +0.0001552911388847687 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0456_text_document +0.00014069857571193372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0094_text_document +0.0001389384065464389 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0092_text_document +0.00013931998654942435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0645_text_document +6.697683673653888e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1426_text_document +0.00021210844645545962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0334_text_document +0.00014137119144044108 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0809_text_document +0.0001673120682697486 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0422_text_document +0.000160278712649626 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1170_text_document +0.00013434353209494217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0153_text_document +0.0001256954694412835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0147_text_document +0.00013924245755776423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0100_text_document +0.0001472288250125752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0636_text_document +0.00014097711532807256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1026_text_document +0.00014773134717397182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0639_text_document +0.00015352249953986432 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0081_text_document +0.00016123223192793427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0720_text_document +0.00021482114302636226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1094_text_document +0.00015645132444863934 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0381_text_document +0.00020944206443748354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0342_text_document +0.0002062452865086777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1387_text_document +0.00019918740595113046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0347_text_document +0.00012991208314417127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0191_text_document +0.00016986053451494 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0409_text_document +0.00015370758356460988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1118_text_document +0.00012877020616415807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0159_text_document +0.00017410552918525 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0485_text_document +0.00014130939816487588 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1009_text_document +0.00016995001697396185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1165_text_document +0.00013186413368723246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0116_text_document +0.0001780030357606211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0586_text_document +0.00019946492842372164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1381_text_document +0.00018695527893767175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0668_text_document +0.0001509922737380354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0992_text_document +0.00013508028066455278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0073_text_document +0.00019538060272292147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0664_text_document +0.00014994073210138526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1264_text_document +0.00013272391894576728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0183_text_document +0.00014297447569020546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0822_text_document +0.00022036909181227284 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0338_text_document +0.00016213052378631979 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1200_text_document +0.00016695883561519947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0479_text_document +0.00016163212750933134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0972_text_document +0.00014238258162886823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0826_text_document +0.0001495758517091787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0780_text_document +0.00015672227745549565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0432_text_document +0.00016811295148550867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0373_text_document +0.00015381983397418742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0767_text_document +0.00015443879289861833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0469_text_document +0.0001563389422706874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0779_text_document +0.0001249130802778805 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0217_text_document +0.0001867001004554042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1177_text_document +0.0001666242847378148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0364_text_document +0.00014083154963158667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1014_text_document +0.00015687949743293156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0461_text_document +0.00016106205104171245 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1209_text_document +0.00015977557248552813 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0976_text_document +0.00015626689249684247 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0804_text_document +0.00018295367015701234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0549_text_document +0.00016294566957482463 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1180_text_document +0.00012895551040454305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0120_text_document +0.00020400658875872852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1401_text_document +0.00018195688820768198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0559_text_document +0.00017126939272322272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0933_text_document +0.00016370097379399474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1157_text_document +0.00016306170313565844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0114_text_document +0.00016611485489014685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0480_text_document +0.00016347151406159876 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0964_text_document +0.00014631075526270524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1023_text_document +0.0001766669845430746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0580_text_document +0.0001530457703608652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1053_text_document +0.00017282679878998538 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0266_text_document +0.00015993384615869773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0438_text_document +0.00016368168822886812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0875_text_document +0.0001642440794865544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1286_text_document +0.00015130922676851358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0123_text_document +0.00015810251823530065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0729_text_document +0.00017183092026629049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0937_text_document +0.00015814901712857293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0295_text_document +0.00016113452432452787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0277_text_document +0.00015619849162394083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0069_text_document +0.00020959278805355666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1006_text_document +0.00014410102715087833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0994_text_document +0.00016078851749564712 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1100_text_document +0.00017027983908149928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0960_text_document +0.00020003986430140833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1407_text_document +0.00015782966699670828 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0741_text_document +0.00016914121141032142 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0374_text_document +0.00022097211894746266 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1065_text_document +0.0001571948997030719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0730_text_document +0.00015974850807369602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0306_text_document +0.00021789133441721216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1074_text_document +0.0001233286077686087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0222_text_document +0.0001618458158759695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0008_text_document +0.0001615455933676868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0763_text_document +0.00018103419979578447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0563_text_document +0.00016396951255152545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1172_text_document +0.00015073530126545124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0632_text_document +0.00021341112078889535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1092_text_document +0.00017307825526693754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0906_text_document +0.00015218172857770168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0613_text_document +0.00015344576003029675 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0890_text_document +0.0002173692545799705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1070_text_document +0.00013765738216305341 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0136_text_document +0.0001734720773190499 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0947_text_document +0.00014938640060072316 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1238_text_document +0.00016249447142088974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0882_text_document +0.00016869913054486763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0495_text_document +0.00016113655951414623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1128_text_document +0.00014335521205971165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1279_text_document +0.00017876209492720022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0575_text_document +0.00015697716171522857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1201_text_document +0.00016786332713047465 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0404_text_document +0.00015265307759079717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0773_text_document +0.00017523326611003096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0898_text_document +0.0001617143182584976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1321_text_document +0.00016606821407511515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0846_text_document +0.00016912687076932746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0518_text_document +0.00021366183110486437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1367_text_document +0.00020204352923874265 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0345_text_document +0.00016502825564571993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0015_text_document +0.00015942624346907078 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0451_text_document +0.00016489707368570657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1297_text_document +0.00015383928605482828 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1241_text_document +0.00020098250321233543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0987_text_document +0.00013164332599265317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0150_text_document +0.00017734345224022555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0696_text_document +0.00014991203547650442 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0807_text_document +0.0001485622941114972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1035_text_document +6.618997286354476e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1434_text_document +0.00014823391536404723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0649_text_document +0.00019808432009137914 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1404_text_document +0.00015719535570287444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1281_text_document +0.00015574089977064715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0768_text_document +0.00015602103595017177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0738_text_document +0.0001628543123496676 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1309_text_document +0.00017573876028003664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1133_text_document +0.00022550434769031454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1058_text_document +0.0001552331578668301 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0427_text_document +0.00015324611041067144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0468_text_document +0.000170011687280753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0500_text_document +0.0001447196970422825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0148_text_document +0.0001784724948903193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0386_text_document +0.0001498982424414089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0616_text_document +0.0001623859542981665 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0428_text_document +0.00013361743194427285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0118_text_document +0.00017779427607269958 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0569_text_document +0.00020986255379125862 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0323_text_document +0.00015466768584649975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0718_text_document +0.00014830412557015837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1252_text_document +0.00016505017860901167 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1191_text_document +0.00013490785926470453 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0143_text_document +0.00020564465624924356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0314_text_document +0.00018728523619804526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0671_text_document +0.00017721472581997018 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1486_text_document +0.00017590602804728252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0596_text_document +0.0011586118073822192 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0012_text_document +0.001160584792279032 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0001_text_document +0.0011596589068878752 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0073_text_document +0.0011597796811325983 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0045_text_document +0.0011595856370794394 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0084_text_document +0.0011595638420318743 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0065_text_document +0.0011595056492598743 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0032_text_document +0.00019344370639184713 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0085_text_document +0.0011607702873631174 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0064_text_document +0.0011599628933925152 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0025_text_document +0.0011589412563486842 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0042_text_document +0.0011602895576833848 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0055_text_document +0.0011597331783997562 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0023_text_document +0.0011594813904417097 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0028_text_document +0.0011606546026116473 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0036_text_document 
+0.0011598078108047945 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0037_text_document +0.0011617470351964222 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0016_text_document +0.0011581358123008063 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0000_text_document +0.0011596262720494357 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0019_text_document +0.0011610223957263077 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0046_text_document +0.0011584882104731472 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0059_text_document +0.0011620862692660026 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0017_text_document +0.001161457666864065 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0072_text_document +0.0011590371889477892 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0033_text_document +0.0011609938621736805 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0006_text_document +0.001161141044100396 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0061_text_document +0.0011594671231655441 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0071_text_document +0.0011597802979499891 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0057_text_document +0.0011595547972095988 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0011_text_document +0.001158806362284141 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0047_text_document +0.0011602692107071176 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0004_text_document +0.0011585986122886333 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0009_text_document +0.001158546881103677 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0070_text_document +0.0011602619863458935 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0018_text_document +0.0011592879277164485 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0054_text_document +0.0011604245887073812 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0049_text_document +0.0011591629192252762 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0003_text_document +0.0011597122241032863 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0021_text_document +0.0011606137082184533 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0051_text_document +0.0011606825853273499 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0074_text_document +0.0011595027201270456 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0027_text_document +0.0011613556290724053 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0050_text_document +0.0011598527410031864 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0079_text_document +0.001158761645522561 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0022_text_document +0.0011593918929374918 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0030_text_document +0.0011580315931549376 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0034_text_document +0.0011604072238482566 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0020_text_document +0.001158231578949041 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0035_text_document +0.0011598418077399845 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0015_text_document +0.001159325008238741 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0066_text_document +0.0011599099580640463 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0044_text_document +0.0011584870238244551 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0010_text_document +0.0011599085999661118 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0002_text_document +0.0011589833818773999 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0041_text_document +0.0011606481795132088 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0067_text_document +0.001161458692060555 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0048_text_document +0.0011622006339082917 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0013_text_document +0.001159737838020273 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0083_text_document +0.0011590437070181112 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0053_text_document +0.0011634257187213065 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0008_text_document +0.0011590984828017124 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0014_text_document +0.0011602559046463836 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0069_text_document +0.001160367797520441 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0056_text_document +0.0011604031640501619 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0062_text_document +0.0011610001203209528 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0031_text_document +0.0011613029071807482 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0007_text_document +0.0011606767800460063 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0077_text_document +0.0011594886287987906 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0058_text_document +0.0011611587128695477 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0076_text_document +0.0011625946322648768 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0078_text_document +0.0011598797869962062 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0005_text_document +0.0011606009649910922 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0081_text_document +0.0011598277708955993 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0040_text_document +0.0011608718238032025 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0068_text_document +0.0011604076497221763 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0075_text_document +0.0011584462678902703 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0063_text_document +0.0011603135740733204 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0029_text_document +0.0011622164917135802 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0039_text_document +0.001160612435595214 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0026_text_document +0.0011604065740406324 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0052_text_document +0.0011627256914647336 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0024_text_document +0.0011591747252301023 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0043_text_document +0.0011607364688750981 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0060_text_document +0.0011606778657245907 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0082_text_document +0.0011588098597487147 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0080_text_document +0.0011601593642272241 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0038_text_document +0.001715327772970356 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0039_text_document +0.0017476947624003078 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0014_text_document +0.0017267559770325844 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0034_text_document +0.0011498438827029142 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0007_text_document +0.0017659420675981785 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0020_text_document +0.0017365986177600462 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0026_text_document +0.0017510246549559635 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0036_text_document +0.0017252368964000024 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0030_text_document +0.0017577211312850632 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0015_text_document +0.0017721505923411433 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0018_text_document +0.0017199608019077789 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0033_text_document +0.001763655692201186 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0027_text_document +0.0017368406475642309 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0023_text_document +0.0017159618112572714 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0024_text_document +0.0017364731392365761 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0009_text_document +0.0017439630879065747 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0025_text_document +0.0017500328397861851 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0010_text_document +0.0017865858138443973 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0032_text_document +0.0017566977721906304 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0029_text_document +0.0017371397837150156 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0021_text_document +0.0017582147893048033 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0040_text_document +0.0011272603991442094 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0000_text_document +0.0017149137099469502 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0013_text_document +0.0011524697523897062 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0005_text_document +0.0017728444816641966 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0022_text_document +0.0017617078558540117 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0011_text_document +0.0017090530525973265 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0038_text_document +0.0011492816454542877 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0003_text_document +0.0017362843828160517 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0019_text_document +0.0017368328836137243 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0031_text_document +0.0017467857780397841 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0012_text_document +0.0017828246319376343 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0041_text_document +0.0011581997009018688 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0004_text_document +0.0011523562998874855 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0001_text_document +0.0017729287558360615 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0037_text_document +0.0011559024877821585 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0006_text_document +0.001756650413292843 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0016_text_document +0.001156615396883475 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0002_text_document +0.0017153661936226234 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0017_text_document +0.001764036695919234 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0028_text_document +0.001753337750065643 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0035_text_document +0.0012705251979895095 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0008_text_document +0.00269701351832541 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0001_text_document +0.002871694450382552 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0000_text_document +0.0009312920312920378 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0002_text_document +0.003491293635742718 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0000_text_document +0.0018087063642572823 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0001_text_document +0.002558997095701873 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_35_text_document +0.0030813105581653888 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_33_text_document +0.0019635118705125343 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_59_text_document +0.004901530213612799 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_3_text_document +0.003793853523990452 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_25_text_document +0.0025955935796863213 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_13_text_document +0.002786190590856374 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_43_text_document +0.003860604753313106 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_30_text_document +0.00262527053779086 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_53_text_document +0.0025961058890461132 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_12_text_document +0.002517804312074853 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_47_text_document +0.001965319952716967 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_57_text_document +0.0024031443284573315 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_49_text_document +0.0021544653036229956 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_28_text_document +0.001966106318481444 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_60_text_document +0.004942382135998647 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_5_text_document +0.004476005981762131 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_38_text_document +0.003703151369852988 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_18_text_document +0.0035249004189965697 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_8_text_document +0.002202908872645994 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_29_text_document +0.0019659892928062975 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_58_text_document +0.0033082157671442004 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_26_text_document +0.004728972404877124 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_39_text_document +0.004933803932309063 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_1_text_document +0.004932534365989358 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_2_text_document +0.002416452767493874 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_48_text_document +0.002620239890699942 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_54_text_document +0.004920228925189318 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_6_text_document +0.003526184767940545 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_10_text_document +0.003478586000712196 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_23_text_document +0.004325225087180493 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_41_text_document +0.0029001545338120283 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_21_text_document +0.003235865499567488 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_24_text_document +0.003708174490218723 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_17_text_document +0.0037061510536654098 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_20_text_document +0.0034825398408553174 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_22_text_document +0.0023953996094720226 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_50_text_document +0.0026860110123915503 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_46_text_document +0.002775825753828905 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_37_text_document +0.0031227832671625543 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_42_text_document +0.0029663524802271864 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_27_text_document +0.0038380125626091056 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_61_text_document +0.002556361839801216 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_36_text_document +0.0025955479628912517 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_14_text_document +0.0010043379135763284 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_62_text_document +0.0037134182490424516 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_19_text_document +0.0019352359233920818 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_55_text_document +0.002732128376890259 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_11_text_document +0.004389286673762447 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_7_text_document +0.0025959811034233144 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_15_text_document +0.004279327177425728 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_31_text_document +0.0049375674289130365 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_4_text_document +0.004749708406741731 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_40_text_document +0.0025599583023118502 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_34_text_document +0.0026884724043408277 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_44_text_document +0.0019620663976446513 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_56_text_document +0.002683235274555792 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_45_text_document +0.004312637092757081 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_0_text_document +0.0035302801429202123 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_9_text_document +0.0024212069391470387 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_51_text_document +0.00401255756160585 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_32_text_document +0.003602075549037339 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_16_text_document +0.002645748366381084 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_52_text_document From 6bcac4ee574da196d0d7da548693b2e8f62e0146 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 6 Mar 2024 22:17:16 -0600 Subject: [PATCH 098/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 78 +++++++++++++------------------------ 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index e627c6f8bf..b990cc124d 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -5,6 +5,7 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home + function sourceFile() { fp="$1" echo "source-ing ${fp}" @@ -17,42 +18,27 @@ function sourceFile() { } # +++++++++++++++ SCRIPT START ++++++++++++++++++++++ -# ---- source ./helpers_alcf.sh --------------------- +cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') -sourceFile "${HERE}/helpers_alcf.sh" || exit +sourceFile "${HERE}/ALCF_utils/helpers_alcf.sh" || exit + +# ---- fns from ./helpers_alcf.sh ------------------- +setEnv || exit +saveDSenv || exit +ezpz || exit +makeHostfiles || exit +DFL="/eagle/datasets/dolma/data_file_list_reweighted.txt" +setupData "${DATA_FILE_LIST:-${DFL}}" || exit -# ---- load conda ----------------------------------- -module load conda/2023-10-04; conda activate base -if [[ "${VIRTUAL_ENV}" ]]; then - echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" 
-else - echo "Not using VIRTUAL_ENV" - # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit -fi echo "Using $(which python3)" -# ---- fns from ./helpers_alcf.sh ------------------- -ezpz -makeHostfiles -saveDSenv -# setupData "${DOLMA_CHUNK_IDX:-00}" -# export DOLMA_CHUNK_IDX="${DOLMA_CHUNK_IDX:-0}" -# -# ---- DATA SETUP ------------------------------------ -dfl_debug="./data_file_list_shuf_debug.txt" -DATA_FILE_LIST="${DATA_FILE_LIST:-${dfl_debug}}" && export DATA_FILE_LIST="${DATA_FILE_LIST}" -NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" -WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" -DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" -dcp="${HERE}/.cache/${DFL_STEM}-index-cache" -DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -mkdir -p "${DATA_CACHE_PATH}" -if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then - echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." -else - echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -fi -echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" +# mkdir -p "${DATA_CACHE_PATH}" +# if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then +# echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." 
+# else +# echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" +# fi +# echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" # ---------------------------------------------------- # ---- Parallelism Settings -------------------------- @@ -65,6 +51,7 @@ export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} # ---------------------------------------------------- # ---- Llama2 7B Config ------------------------------ +export MODEL_KEY="Llama-7B" export HEADS=${HEADS:-32} export NLAYERS=${NLAYERS:-32} export HIDDEN=${HIDDEN:-4096} @@ -73,7 +60,7 @@ export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # ---------------------------------------------------- # ---- Run Settings ---------------------------------- -export LR=${LR:-0.00015} +export LR=${LR:-0.0003} export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 export ZERO_STAGE=${ZERO_STAGE:-2} @@ -99,21 +86,6 @@ echo "- MODEL_TYPE: ${MODEL_TYPE}" echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -# if [[ "${DOLMA_CHUNK_IDX}" == 0 ]]; then -# TRAIN_ITER=78739 -# elif [[ "${DOLMA_CHUNK_IDX}" == 1 ]]; then -# TRAIN_ITER=81008 -# elif [[ "${DOLMA_CHUNK_IDX}" == 2 ]]; then -# TRAIN_ITER=79591 -# elif [[ "${DOLMA_CHUNK_IDX}" == 3 ]]; then -# TRAIN_ITER=78552 -# else -# echo "caught DOLMA_CHUNK_IDX=${DOLMA_CHUNK_IDX}" -# TRAIN_ITER="${TRAIN_ITER:-320000}" -# echo "Setting TRAIN_ITER=${TRAIN_ITER}" -# # echo "Unknown DOLMA_CHUNK_IDX: ${DOLMA_CHUNK_IDX}" -# fi - # +++++NOTES ++++++++++++++++++++++++++++++++++++++++++++++++++ # XXX: # - need to merge *.json files @@ -211,7 +183,6 @@ EXEC="pretrain_gpt_alcf.py" # --vocab-file $VOCAB_FILE \ # --merge-file $MERGE_FILE \ # --lr-decay-iters 320000 \ - # --num-workers 0 \ # --lr-warmup-iters 5000 \ # --lr-decay-iters 10000 \ # --num-workers 4 \ @@ -220,7 +191,7 @@ EXEC="pretrain_gpt_alcf.py" run_cmd=" deepspeed $launcher ${EXEC} \ --$DTYPE \ - --split 90,5,5 \ + --split 100,0,0 \ 
--use-flash-attn-v2 \ --no-bias-gelu-fusion \ --lr-decay-style cosine \ @@ -232,6 +203,7 @@ run_cmd=" --use-checkpoint-opt_param-scheduler \ --lr ${LR} \ --log-interval 1 \ + --num-workers 0 \ --seq-length $SEQ \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ @@ -257,8 +229,10 @@ run_cmd=" ${LLAMA_ARGS} \ ${gpt_args[*]} \ $custom_args \ - >> ${OUTPUT_LOG} 2>&1 & + |& tee ${OUTPUT_LOG} " + # >> ${OUTPUT_LOG} 2>&1 & + # >> ${OUTPUT_LOG} 2>&1 & # |& tee $OUTPUT_DIR/output.log # ${EXTRA_ARGS} \ @@ -268,7 +242,7 @@ ds_report echo "${run_cmd}" -echo "[!! NOTE] View output at:" +printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" # echo "${OUTPUT_LOG}" eval "${run_cmd}" From e7d76d3904c7b0b8cc5dedcb4290a6def5d812e3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 10:27:39 -0600 Subject: [PATCH 099/268] Move `ALCF_utils/*` to `ALCF/*` - move (existing) contents from `ALCF/*` to `ALCF/pre-AuroraGPT/*` - Rename `ALCF_utils/helpers_alcf.sh` -> `ALCF/helpers.sh` --- {ALCF_utils => ALCF}/fused_stackcode.py | 0 .../fused_stackcode_bysize.py | 0 ALCF/helpers.sh | 351 ++ ALCF/{ => pre-AuroraGPT}/README.md | 0 ALCF/{ => pre-AuroraGPT}/args.sh | 0 ALCF/{ => pre-AuroraGPT}/launch.sh | 0 ALCF/{ => pre-AuroraGPT}/llama2_vars.sh | 0 ALCF/{ => pre-AuroraGPT}/model.sh | 0 ALCF/{ => pre-AuroraGPT}/setup.sh | 0 ALCF/{ => pre-AuroraGPT}/submit-pbs.sh | 0 ALCF/{ => pre-AuroraGPT}/submit.sh | 0 ALCF/{ => pre-AuroraGPT}/train-gpt3.sh | 0 ALCF/{ => pre-AuroraGPT}/train-llama.sh | 0 {ALCF_utils => ALCF}/test_blend.sh | 0 {ALCF_utils => ALCF}/test_blend_full.sh | 0 .../test_blendable_dataset.py | 0 ALCF_utils/data_file_list_polaris.txt | 3074 ----------------- ALCF_utils/helpers_alcf.sh | 176 - 18 files changed, 351 insertions(+), 3250 deletions(-) rename {ALCF_utils => ALCF}/fused_stackcode.py (100%) rename {ALCF_utils => ALCF}/fused_stackcode_bysize.py (100%) create mode 100644 ALCF/helpers.sh rename ALCF/{ => pre-AuroraGPT}/README.md 
(100%) rename ALCF/{ => pre-AuroraGPT}/args.sh (100%) rename ALCF/{ => pre-AuroraGPT}/launch.sh (100%) rename ALCF/{ => pre-AuroraGPT}/llama2_vars.sh (100%) rename ALCF/{ => pre-AuroraGPT}/model.sh (100%) rename ALCF/{ => pre-AuroraGPT}/setup.sh (100%) rename ALCF/{ => pre-AuroraGPT}/submit-pbs.sh (100%) rename ALCF/{ => pre-AuroraGPT}/submit.sh (100%) rename ALCF/{ => pre-AuroraGPT}/train-gpt3.sh (100%) rename ALCF/{ => pre-AuroraGPT}/train-llama.sh (100%) rename {ALCF_utils => ALCF}/test_blend.sh (100%) rename {ALCF_utils => ALCF}/test_blend_full.sh (100%) rename {ALCF_utils => ALCF}/test_blendable_dataset.py (100%) delete mode 100644 ALCF_utils/data_file_list_polaris.txt delete mode 100644 ALCF_utils/helpers_alcf.sh diff --git a/ALCF_utils/fused_stackcode.py b/ALCF/fused_stackcode.py similarity index 100% rename from ALCF_utils/fused_stackcode.py rename to ALCF/fused_stackcode.py diff --git a/ALCF_utils/fused_stackcode_bysize.py b/ALCF/fused_stackcode_bysize.py similarity index 100% rename from ALCF_utils/fused_stackcode_bysize.py rename to ALCF/fused_stackcode_bysize.py diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh new file mode 100644 index 0000000000..6ff3447f05 --- /dev/null +++ b/ALCF/helpers.sh @@ -0,0 +1,351 @@ +#!/bin/bash --login + +buildCLIargs() { + custom_args=" $@" + export CLI_ARGS=" + --$DTYPE \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --use-flash-attn-v2 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${NCCL} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} 
\ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + $ds_args \ + ${LLAMA_ARGS} \ + ${gpt_args[*]} \ + ${custom_args} \ + " +} + +printJobInfo() { + echo "++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "- MPICH_DIR=$MPICH_DIR" + echo "- Using $(which python3)" + echo "- WORLD_SIZE:${WORLD_SIZE}" + echo "- NCCL: ${NCCL:-nccl}" + echo "- MODEL_TYPE: ${MODEL_TYPE}" + echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" + echo "++++++++++++++++++++++++++++++++++++++++++++++++++" +} + +function setDSlauncher() { + # launcher setting + outdir=$1 + # hfds=$1 + # hfmpi=$2 + # here=$(python3 -c 'import os; print(os.getcwd())') + export hfds="$outdir/hostfile_deepspeed" + export hfmpi="$outdir/hostfile_mpich" + [ -f "$hfds" ] || exit + [ -f "$hfmpi" ] || exit + export LAUNCHER=${LAUNCHER:-MPICH} + if [[ $LAUNCHER == "deepspeed" ]]; then + export launcher="" + else + export launcher="--force_multi --hostfile $hfds --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" + fi +} + +setParams() { + # [Parallelism Settings] {{{ + if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- + TP=${TP:-1} # • TP = 1 + PP=${PP:-1} # • PP = 1 + export CCL=${CCL:-ccl} # • CCL + export BE="${CCL}" # • BE = CCL + export DTYPE=${DTYPE:-bf16} # • DTYPE: bf16 + elif [[ $(hostname) == x3* ]]; then # ---- [POLARIS] ---- + PP=${PP:-1} # • PP = 1 + TP=${TP:-2} # • TP = 2 + export NCCL=${NCCL:-nccl} # • NCCL + export BE="${NCCL}" # • BE = NCCL + export DTYPE=${DTYPE:-fp16} # • DTYPE: FP16 + fi + # }}} + export PP="${PP}" + export TP="${TP}" + export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" + export 
WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} + # ---- Llama2 7B Config ------------------------------ + export MODEL_KEY="Llama-7B" + export HEADS=${HEADS:-32} + export NLAYERS=${NLAYERS:-32} + export HIDDEN=${HIDDEN:-4096} + export NUM_KV_HEAD=${NUM_KV_HEAD:-8} + export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} + # ---- Run Settings ---------------------------------- + export LR=${LR:-0.0003} + export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 + export ZERO_STAGE=${ZERO_STAGE:-2} + export MICRO_BATCH=${MICRO_BATCH:-8} + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} + export EVAL_ITERS="${EVAL_ITERS:-10}" + export TRAIN_ITER=${TRAIN_ITER:-317892} + export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" + export SAVE_INTERVAL=${SAVE_INTERVAL:-200} + export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} + # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} + export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" + export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" + export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" + # ---------------------------------------------------- +} + + +setArgs() { + # ---- Set DeepSpeed arguments -------------------------------- + ds_args=" " + ds_args=" --deepspeed ${ds_args}" + if [[ $PP == 1 ]]; then + ds_args=" --no-pipeline-parallel ${ds_args}" + fi + ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" + ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" 
+ ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + # --checkpoint-activations \ + # --deepspeed-activation-checkpointing + fi + export ds_args + # --------------------------------------------------------------- + gpt_args=() + # we are now using activation checkpoint provided by megatron, see below. + # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" + if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then + echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" + gpt_args+=( + "--checkpoint-activations" + "--checkpoint-num-layers 1" + ) + fi + export gpt_args +} + +ezpz() { + if [[ ! -d ezpz ]]; then + git clone https://github.com/saforem2/ezpz + else + echo "Found ezpz!" + fi + if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + echo "Has ezpz installed. Nothing to do." + else + echo "Does not have ezpz installed. Installing..." + echo "Using $(which python3) to install \`ezpz\`:" + python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + fi + source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit + source ezpz/src/ezpz/bin/getjobenv || exit +} + +saveDSenv() { + echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" + { + echo "PATH=${PATH}" ; + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; + echo "http_proxy=${http_proxy}" ; + echo "https_proxy=${https_proxy}" ; + echo "CFLAGS=${CFLAGS}" ; + echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; + } > .deepspeed_env +} + +setOutput() { + # ---- Specify output location -------------------------------- + export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" + # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} + OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + export 
OUTPUT_DIR="${OUTPUT_DIR}" + export OUTPUT_LOG="${OUTPUT_DIR}/output.log" + export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" + echo "${OUTPUT_LOG}" >> "logs/latest" + mkdir -p "${OUTPUT_DIR}" + echo "!!!Please see logs at ${OUTPUT_DIR}" +} + +buildDSconfig() { + # ---- Build DeepSpeed Config --------------------------------- + export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" + bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 + # ------------------------------------------------------------- +} + + +# makeDSenv() { +# saveDSenv +# } + + +# makeDSenv() { +# echo "PATH=${PATH}" > .deepspeed_env +# echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env +# echo "http_proxy=${http_proxy}" >> .deepspeed_env +# echo "https_proxy=${https_proxy}" >> .deepspeed_env +# echo "CFLAGS=${CFLAGS}" >> .deepspeed_env +# echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env +# } + +sumWeights() { + local file_list=$1 + weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") + # weights=$(echo "$weights" | tr ",]" "]") + # echo "weights: $weights" + python3 -c "import numpy as np; print(np.sum(${weights}))" +} + +sumFiles() { + local rd=$1 + for f in $("${rd}/*.txt"); do + ws=$(sumWeights "${rd}/${f}") + echo "sum($f.weights)=${ws}" + done +} + +# setupData() { +# cidx=$1 +# echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" +# dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" +# if [[ -z "${DATA_FILE_LIST}" ]]; then +# DATA_FILE_LIST="${dfl}" +# else +# echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" 
+# fi +# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" +# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" +# export WEIGHT_SUM="${WEIGHT_SUM}" +# export NDOCS="${NDOCS}" +# echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" +# echo "WEIGHT SUM: ${WEIGHT_SUM}" +# data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") +# export DOLMA_CHUNK_IDX="${cidx}" +# export DATA_FILE_LIST_STEM="${data_file_list_stem}" +# export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" +# mkdir -p "${DATA_CACHE_PATH}" +# } +# + + +setEnv() { + if [[ $(hostname) == x4* ]]; then + SETENV_FILE="${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" + if [[ "${SETENV_FILE}" ]]; then + # shellcheck source=/home/foremans/anl_24_release_q4/llm.devkit/setenv.sh + source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" + else + echo "Unable to source ${SETENV_FILE}, exiting!" + exit + fi + elif [[ $(hostname) == x3* ]]; then + # ---- load conda ----------------------------------- + module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 + export PYTHONUSERBASE="${HOME}/.local/polaris/conda/2024-03-06" + mkdir -p "${PYTHONUSERBASE}" + # if [[ "${VIRTUAL_ENV}" ]]; then + # echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" 
+ # else + # echo "Not using VIRTUAL_ENV" + # # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit + # fi + else + echo "Unknown hostname $(hostname)" + exit 1 + fi +} + +makeHostfiles() { + GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') + export GPUS_PER_NODE="${GPUS_PER_NODE}" + # ---- Make MPICH hostfile ---------------- + export hostfile_mpich=hostfile_mpich + cat "$PBS_NODEFILE" > "${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + + +setData() { # dfl: abbrv. for DATA_FILE_LIST + dfl="${1:-/eagle/datasets/dolma/data_file_list_reweighted.txt}" + # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" + printf "Calling: \`setData()\` with %s\n" "${dfl}" + ndocs=$(wc -l < "${dfl}") + ws=$(sumWeights "${dfl}") + dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + dcp="${HERE}/.cache/${dfl_stem}/index-cache" + mkdir -p dcp + export DATA_FILE_LIST="${dfl}" + export NUM_DOCS="${ndocs}" + export WEIGHT_SUM="${ws}" + export DFL_STEM="${dfl_stem}" + export DATA_CACHE_PATH="${dcp}" + echo "--------------------" + echo "Updated environment:" + printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" + printf "NUM_DOCS: %s\n " "${NUM_DOCS}" + printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}" + printf "DFL_STEM: %s\n" "${DFL_STEM}" + printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" + echo "--------------------" +} + +printBlack() { + printf "\e[1;30m%s\e[0m\n" "$@" +} + +printRed() { + printf "\e[1;31m%s\e[0m\n" "$@" +} + +printGreen() { + printf "\e[1;32m%s\e[0m\n" "$@" +} + +printYellow() { + printf "\e[1;33m%s\e[0m\n" "$@" +} + +printBlue() { + printf "\e[1;34m%s\e[0m\n" "$@" +} + +printMagenta() { + printf "\e[1;35m%s\e[0m\n" "$@" +} + +printCyan() { + printf "\e[1;36m%s\e[0m\n" "$@" +} +printWhite() { 
+ printf "\e[1;37m%s\e[0m\n" "$@" +} diff --git a/ALCF/README.md b/ALCF/pre-AuroraGPT/README.md similarity index 100% rename from ALCF/README.md rename to ALCF/pre-AuroraGPT/README.md diff --git a/ALCF/args.sh b/ALCF/pre-AuroraGPT/args.sh similarity index 100% rename from ALCF/args.sh rename to ALCF/pre-AuroraGPT/args.sh diff --git a/ALCF/launch.sh b/ALCF/pre-AuroraGPT/launch.sh similarity index 100% rename from ALCF/launch.sh rename to ALCF/pre-AuroraGPT/launch.sh diff --git a/ALCF/llama2_vars.sh b/ALCF/pre-AuroraGPT/llama2_vars.sh similarity index 100% rename from ALCF/llama2_vars.sh rename to ALCF/pre-AuroraGPT/llama2_vars.sh diff --git a/ALCF/model.sh b/ALCF/pre-AuroraGPT/model.sh similarity index 100% rename from ALCF/model.sh rename to ALCF/pre-AuroraGPT/model.sh diff --git a/ALCF/setup.sh b/ALCF/pre-AuroraGPT/setup.sh similarity index 100% rename from ALCF/setup.sh rename to ALCF/pre-AuroraGPT/setup.sh diff --git a/ALCF/submit-pbs.sh b/ALCF/pre-AuroraGPT/submit-pbs.sh similarity index 100% rename from ALCF/submit-pbs.sh rename to ALCF/pre-AuroraGPT/submit-pbs.sh diff --git a/ALCF/submit.sh b/ALCF/pre-AuroraGPT/submit.sh similarity index 100% rename from ALCF/submit.sh rename to ALCF/pre-AuroraGPT/submit.sh diff --git a/ALCF/train-gpt3.sh b/ALCF/pre-AuroraGPT/train-gpt3.sh similarity index 100% rename from ALCF/train-gpt3.sh rename to ALCF/pre-AuroraGPT/train-gpt3.sh diff --git a/ALCF/train-llama.sh b/ALCF/pre-AuroraGPT/train-llama.sh similarity index 100% rename from ALCF/train-llama.sh rename to ALCF/pre-AuroraGPT/train-llama.sh diff --git a/ALCF_utils/test_blend.sh b/ALCF/test_blend.sh similarity index 100% rename from ALCF_utils/test_blend.sh rename to ALCF/test_blend.sh diff --git a/ALCF_utils/test_blend_full.sh b/ALCF/test_blend_full.sh similarity index 100% rename from ALCF_utils/test_blend_full.sh rename to ALCF/test_blend_full.sh diff --git a/ALCF_utils/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py similarity index 100% rename from 
ALCF_utils/test_blendable_dataset.py rename to ALCF/test_blendable_dataset.py diff --git a/ALCF_utils/data_file_list_polaris.txt b/ALCF_utils/data_file_list_polaris.txt deleted file mode 100644 index bac6e0cb33..0000000000 --- a/ALCF_utils/data_file_list_polaris.txt +++ /dev/null @@ -1,3074 +0,0 @@ -0.00029986601436087147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0553_text_document -0.00025354733193980704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0299_text_document -0.00022796278454747796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0366_text_document -0.00015898148098181938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0753_text_document -0.00032353442734302674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0429_text_document -0.0003246771202039335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0372_text_document -0.0002262495314665641 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0124_text_document -0.0003123475130228927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0437_text_document -0.0002463525690862687 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0053_text_document -0.0002851511545680644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0615_text_document -0.00017894057315965558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0182_text_document -0.0002929038657836376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0713_text_document -0.00025554560858852976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0688_text_document -0.00026670241496326607 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0166_text_document -0.0003428428123152775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0768_text_document -0.0002463170207315496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0692_text_document -0.0002760865339513081 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0041_text_document -0.00033407859155259556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0416_text_document -0.00034039084474378353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0630_text_document -0.0003366738865632568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0639_text_document -0.00017446109057505982 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0225_text_document -0.000253028300825639 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0035_text_document -0.00024506403683874226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0365_text_document -0.00024185734672422406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0368_text_document -0.00018780072964724365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0196_text_document -0.00032000780903059043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0328_text_document -0.00035000304711647526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0624_text_document -0.0002569785197146494 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0081_text_document -0.00023270677747769242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0488_text_document 
-0.00017835163350772668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0189_text_document -0.00022425534843704826 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0118_text_document -0.00025809915378253605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0150_text_document -0.0003372508571793003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0314_text_document -0.00017405443644933578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0209_text_document -0.00018999333170612105 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0229_text_document -0.0002578638319368945 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0265_text_document -0.0003026297840378106 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0532_text_document -0.00024115277287793853 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0478_text_document -0.00021079670153911382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0140_text_document -0.0002641088110552866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0256_text_document -0.0002734180919243281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0047_text_document -0.00029907507717544046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0607_text_document -0.0002524517419857655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0023_text_document -0.0002411050941613276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0111_text_document -0.00028888961626426636 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0613_text_document -0.0002666480632163931 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0748_text_document -0.0002433081406689229 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0000_text_document -0.00022825376499345922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0127_text_document -0.0002446050560023786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0106_text_document -0.0002942835023242189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0563_text_document -0.0002763945689252822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0577_text_document -0.0002491076073033573 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0502_text_document -0.00028576551829606866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0705_text_document -0.00028829382894385313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0538_text_document -0.00025082670805714693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0088_text_document -0.0002630381343492388 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0263_text_document -0.00027810544588847204 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0460_text_document -0.00027747466684446923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0571_text_document -0.0003172910973599917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0653_text_document -0.00037520258694866886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0172_text_document 
-0.0003211385761767212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0524_text_document -0.0003255969458125186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0652_text_document -0.0003388825652663348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0322_text_document -0.00028869956272163335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0447_text_document -0.0003205690642373091 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0387_text_document -0.000293306015190877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0612_text_document -0.00026896512005363497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0290_text_document -0.000254210476816404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0339_text_document -0.00022686273928809224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0487_text_document -0.0002611478888145433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0396_text_document -0.00017381296052422173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0178_text_document -0.00023007865273755727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0091_text_document -0.00018780228063321224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0193_text_document -0.0003372571297149312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0408_text_document -0.00021992069576697105 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0496_text_document -0.0001812811364369899 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0755_text_document -0.00033764559935372575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0773_text_document -0.0003018753737677833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0547_text_document -0.0003166294761680392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0384_text_document -0.0002728688935238676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0574_text_document -0.0003040110074396542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0533_text_document -0.0002875528493458918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0464_text_document -0.00022373112059479916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0489_text_document -0.00024189392824227604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0050_text_document -0.00026046059377324407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0060_text_document -0.0002480106883840328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0114_text_document -0.0002569141409359981 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0033_text_document -0.00029923086394617387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0561_text_document -0.00017765017246365572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0208_text_document -0.00017213376798867627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0233_text_document -0.0002807023783016899 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0744_text_document 
-0.0003444627799304048 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0326_text_document -0.00035341598497146246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0313_text_document -0.00023246914923423715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0482_text_document -0.0003151608790524258 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0436_text_document -0.00026075950491544447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0588_text_document -0.0003108468104526031 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0080_text_document -0.0003070762114877851 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0660_text_document -0.0002572020155743189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0038_text_document -0.000253216181543879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0282_text_document -0.00027448432979607844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0745_text_document -0.00033409492045200607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0406_text_document -0.00023278669477113861 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0116_text_document -0.00026115519857486115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0059_text_document -0.0002480831646000483 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0503_text_document -0.00023963680864181876 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0357_text_document -0.0002413187258946405 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0171_text_document -0.0003508576397082831 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0770_text_document -0.0002721719890933602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0286_text_document -0.0002876539863114944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0544_text_document -0.00024182215509176508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0698_text_document -0.00025302595494243913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0155_text_document -0.0002483407803459808 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0341_text_document -0.0002797174988271654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0463_text_document -0.00026783545872394773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document -0.0002494040268048752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0676_text_document -0.0002964792948022985 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0595_text_document -0.00023982416950674604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0174_text_document -0.00017222619019110588 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0198_text_document -0.00023022730483745666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0480_text_document -0.00024958354880531006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0687_text_document -0.0002600257919823223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0145_text_document 
-0.00027443988098405596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0004_text_document -0.0002660738793660898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0583_text_document -0.000284693715918069 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0449_text_document -0.0001797115255501787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0204_text_document -0.0002819480075336546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0715_text_document -0.00024558907785431555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0521_text_document -0.0003452086900121291 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0320_text_document -0.0002849104762477509 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0568_text_document -0.0002520087983082163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0151_text_document -0.0001744206861947346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0197_text_document -0.00028747991690444293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0709_text_document -0.0002459413860995668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0499_text_document -0.000259317580967894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0006_text_document -0.000263466262658637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0269_text_document -0.00030976482490632654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0525_text_document -0.00027110687283220773 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0413_text_document -0.0003179317321820123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0656_text_document -0.00033906740854304013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0646_text_document -0.00027369098806344534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0246_text_document -0.0002986988698925429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0535_text_document -0.0003395569327922032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0333_text_document -0.000164638204335171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0238_text_document -0.00027443002362662267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0241_text_document -0.00023668635544354816 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0469_text_document -0.00024265468189599862 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0689_text_document -0.00033850556579377534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0403_text_document -0.0003485968296908193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0404_text_document -0.00024590976259223263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0360_text_document -0.0001791680338033577 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0191_text_document -0.00017041559638243079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0236_text_document -0.00025180889395144256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0032_text_document 
-0.00028885663919789436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0445_text_document -0.00029091485105272474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0614_text_document -0.00023445115384250546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0490_text_document -0.00032117096366987005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0651_text_document -0.00028813277753017444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0703_text_document -0.00028764255585627627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0702_text_document -0.0003501536879260528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0623_text_document -0.00027353701536416775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0719_text_document -0.00028375403705731966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0728_text_document -0.0002681400162763699 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0031_text_document -0.0002801025363961944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0253_text_document -0.0003054975830412967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0551_text_document -0.0003336419698238177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0327_text_document -0.0002783342023297327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0027_text_document -0.0002269098949925595 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0491_text_document -0.0002614046304668415 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0395_text_document -0.0002389768503487008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0473_text_document -0.00030914674481377635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0662_text_document -0.0003367403637081444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0312_text_document -0.00029362370355206103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0605_text_document -0.0002934194986726569 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0455_text_document -0.0002720662187997295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0580_text_document -0.00026145129858319734 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0005_text_document -0.00034377973283976473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0311_text_document -0.00024579413020292225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0305_text_document -0.0002647178135703726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0260_text_document -0.00029551321578672775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0566_text_document -0.0003056359366755917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0670_text_document -0.00023036150528601932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0129_text_document -0.0002785463392591407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0742_text_document -0.0003002481568613867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0549_text_document 
-0.000247964769587491 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0058_text_document -0.00024596805385249104 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0501_text_document -0.00024541585230637415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0071_text_document -0.00029445519245471536 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0450_text_document -0.0003371086021632213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0375_text_document -0.00022005213043345582 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0131_text_document -0.0002415459805374422 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0697_text_document -0.00024822493157675423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0415_text_document -0.00030080408063844975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0560_text_document -0.0003279134620384162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0643_text_document -0.00023726361757455696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0699_text_document -0.00023085162610295972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0515_text_document -0.0002745291533808234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0739_text_document -0.0002506138351201408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0092_text_document -0.00024556923836562873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0046_text_document -0.00023090386526885743 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0083_text_document -0.00029727809725102304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0443_text_document -0.00026952806591177387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0746_text_document -0.00031532525062300405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0655_text_document -0.00033089961611781194 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0427_text_document -0.0002903020283829348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0603_text_document -0.00022808149386370916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0367_text_document -0.00033365210425584645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0318_text_document -0.00023152652972755692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0520_text_document -0.0002715873565073543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0749_text_document -0.00034308166550822705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0771_text_document -0.0002506620824263125 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0369_text_document -0.0003155532825638363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0434_text_document -0.00029047172619366075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0602_text_document -0.0002491292800421161 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0349_text_document -0.0002559332562543997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0763_text_document 
-0.00028216899144587154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0731_text_document -0.00021527310967872735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0338_text_document -0.00029197280872618074 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0462_text_document -0.0002497346241797662 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0347_text_document -0.00032116368527223036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0649_text_document -0.00017893692809443551 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0194_text_document -0.0002377747166387212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0134_text_document -0.00027551701163288023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0734_text_document -0.000333287766731292 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0632_text_document -0.00025605546217079896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0280_text_document -0.00017630373365634742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0184_text_document -0.00023722922106495997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0089_text_document -0.0002263994556727904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0095_text_document -0.00030174055734719644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0555_text_document -0.0002847362655958324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0016_text_document -0.00025890418522015135 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0168_text_document -0.0003082976431725841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0665_text_document -0.000346081899625068 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0767_text_document -0.0003209875815780836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0666_text_document -0.0002741000975032965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0737_text_document -0.00025522276682037417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0037_text_document -0.0003187901431234778 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0648_text_document -0.0002545353756603635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0064_text_document -0.0002638188827256236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0764_text_document -0.0003507452430899613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0323_text_document -0.0002507240659086237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0009_text_document -0.0003133434303550815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0545_text_document -0.00017501628240587877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0212_text_document -0.0002643006640033749 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0015_text_document -0.00026580807028971245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0267_text_document -0.0002871314019638329 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0727_text_document 
-0.00030829637332256503 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0661_text_document -0.00017563429413230326 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0211_text_document -0.00017132261428552822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0220_text_document -0.0002591872665582224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0278_text_document -0.0002831032389225307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0721_text_document -0.0002771475730643005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0718_text_document -0.00016968487666721077 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0207_text_document -0.00023341688807764153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0619_text_document -0.00028239455769356076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0400_text_document -0.00017479168321170443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0754_text_document -0.0002893192781860632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0610_text_document -0.0002451308928177063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0358_text_document -0.00016439582430752915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0758_text_document -0.00025727386724434066 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0298_text_document -0.0001644915605748274 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0756_text_document -0.0002776780120706089 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0729_text_document -0.00023809989233164537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0468_text_document -0.00025872266999742285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0397_text_document -0.00026660487071654296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0247_text_document -0.00025841743419888175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0149_text_document -0.0002523747545079728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0119_text_document -0.00024845115079991526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0010_text_document -0.00024496676763593767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0093_text_document -0.00029822689182083806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0386_text_document -0.0002800915174915155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0045_text_document -0.0002542209027633981 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0066_text_document -0.00027882609660458894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0393_text_document -0.00029269610879889394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0600_text_document -0.0003170204421857625 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0440_text_document -0.00023451182731251905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0350_text_document -0.0001713437694272821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0214_text_document 
-0.00028770953412712207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0714_text_document -0.0002521884552358564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0161_text_document -0.0003496380818870961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0775_text_document -0.00017772896273136538 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0203_text_document -0.0002500155180274436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0077_text_document -0.0003263419003478976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0332_text_document -0.0003298955338846564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0700_text_document -0.00022675950192557637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0123_text_document -0.0002502100722272073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0024_text_document -0.0002807683240860951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0013_text_document -0.00026633530353392567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0587_text_document -0.0002526947984544801 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0148_text_document -0.00023452398475010418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0513_text_document -0.0002494133638577342 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0674_text_document -0.00018960923298675975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0188_text_document -0.0002979177307236505 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0599_text_document -0.00024756537851651375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0158_text_document -0.00032700344933800113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0425_text_document -0.00024354258639025316 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0003_text_document -0.00029952785677549897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0534_text_document -0.0002633271590276356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0254_text_document -0.00023174553147338835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0121_text_document -0.00023240024227150026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0099_text_document -0.00031612398651064907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0373_text_document -0.00022503402642541325 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0479_text_document -0.0003113968430100502 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0379_text_document -0.000250330440380919 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0344_text_document -0.00024438294664384054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0684_text_document -0.0002809897197744821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0720_text_document -0.0002705284913469089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0391_text_document -0.00027569439555606186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0575_text_document 
-0.0003410636080348197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0319_text_document -0.0003203709003963575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0336_text_document -0.00029253485792760743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0531_text_document -0.00024432311184342917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0474_text_document -0.0003191646877331716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0432_text_document -0.000346686134561813 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0766_text_document -0.0002490456434509238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0342_text_document -0.0002308722169421376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0476_text_document -0.00016206769238354457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0237_text_document -0.0002492309682237096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0061_text_document -0.00027812802408354336 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0250_text_document -0.00018887116876408207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0752_text_document -0.00033632001244040556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0329_text_document -0.0003186771831051032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0376_text_document -0.0003352297812173437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0640_text_document -0.00033654919915956005 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0634_text_document -0.00024490246128476605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0682_text_document -0.00017628521613838942 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0181_text_document -0.0002723200629015754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0076_text_document -0.0002731843102343778 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0244_text_document -0.00024055053612736437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0690_text_document -0.000260910862554004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0303_text_document -0.00017227027731699112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0228_text_document -0.00022621021101649683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0477_text_document -0.00016795045761426564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0224_text_document -0.00016929858707299419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0199_text_document -0.000253504280684824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0343_text_document -0.0002664008409266226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0399_text_document -0.00030071197680104803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0707_text_document -0.0002017638129580961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0760_text_document -0.0003404886940433186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0774_text_document 
-0.00026063745107418964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0270_text_document -0.0002583694107184419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0144_text_document -0.0002910352929877009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0451_text_document -0.00017621100910867517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0180_text_document -0.00024697352709939353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0025_text_document -0.00024146432588700466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0363_text_document -0.00023853893101731814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0516_text_document -0.0003278831887056581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0647_text_document -0.0002658199908149806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0581_text_document -0.00024246994827585146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0679_text_document -0.00033843099030900046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0635_text_document -0.00017307547662390532 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0201_text_document -0.0002069914418510578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0133_text_document -0.0002469419507488919 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0351_text_document -0.0003273613879736889 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0325_text_document -0.00017641368782149634 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0183_text_document -0.00025246502306462557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0287_text_document -0.00024313653131424496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0683_text_document -0.000342460619273468 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0316_text_document -0.00025368942675803356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0275_text_document -0.00031652716411734427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0424_text_document -0.0002825702051622742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0461_text_document -0.0002837151182300374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0576_text_document -0.0002887102616818689 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0390_text_document -0.00024213267448623674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0052_text_document -0.00023430817410036728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0086_text_document -0.0002241156128142375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0492_text_document -0.00016148562048123923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0216_text_document -0.00034595368155095567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0772_text_document -0.00031905795534990306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0439_text_document -0.00026930584215843127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0249_text_document 
-0.00022527795969533147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0493_text_document -0.0002608207889423435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0593_text_document -0.00029249050004478847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0442_text_document -0.00017081784506721255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0218_text_document -0.00023511433888138094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0484_text_document -0.00024698036957836997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0346_text_document -0.0002460253656495418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0157_text_document -0.00024076940542792902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0352_text_document -0.00030382220501968567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0441_text_document -0.00023755063481592102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0486_text_document -0.00028417867237664216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0537_text_document -0.000225984786690567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0485_text_document -0.00024807815243794377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0164_text_document -0.0002503897564561716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0022_text_document -0.00029213581748685935 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0458_text_document -0.00023531327437959358 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0497_text_document -0.00025057009253245374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0170_text_document -0.00026922193390778215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0154_text_document -0.0002748674817686949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0751_text_document -0.00023953538828395883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0048_text_document -0.00032242897439078245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0428_text_document -0.00022552802583542902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0418_text_document -0.00022344225002222384 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0112_text_document -0.00016663075921299214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0757_text_document -0.00032178194519251073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0421_text_document -0.00023947291743916702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0471_text_document -0.00023146797086919565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0510_text_document -0.00021323599497854087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0466_text_document -0.0003337629126591212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0641_text_document -0.00029173993384632753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0601_text_document -0.00027480517202427865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0740_text_document 
-0.00027104181603306824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0594_text_document -0.000259327422305542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0276_text_document -0.00031194191484921015 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0383_text_document -0.00016906057458032387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0232_text_document -0.00027723921638928413 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0717_text_document -0.0003242034882586253 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0644_text_document -0.00025144344474404445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0518_text_document -0.0002742433658271999 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0743_text_document -0.00024688821993792026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0673_text_document -0.00024027301297788078 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0044_text_document -0.0003140105907826361 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0667_text_document -0.00024234707516924134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0308_text_document -0.0002517457940480414 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0675_text_document -0.00027377464202820747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0572_text_document -0.00027127850723190206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0579_text_document -0.0002801181272573297 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0723_text_document -0.0003044955357721288 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0381_text_document -0.0001773914653416496 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0759_text_document -0.000244260849707107 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0504_text_document -0.00028690435806017796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0708_text_document -0.0002549618760436977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0049_text_document -0.0003338929078012418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0642_text_document -0.00025438827048507865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0074_text_document -0.00024788324580290473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0039_text_document -0.0002657964126243008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0401_text_document -0.00024963314944488873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0409_text_document -0.0002474347556695685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0014_text_document -0.00023102994458882423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0098_text_document -0.0002629133147259061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0146_text_document -0.0002878014380556544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0616_text_document -0.0002484505616779537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0101_text_document 
-0.0002979288985446429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0446_text_document -0.00029660556996978065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0565_text_document -0.00026095434544066553 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0295_text_document -0.0002831348173037093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0730_text_document -0.0002365956782695563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0498_text_document -0.00032328415452513646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0638_text_document -0.00024548967283056556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0301_text_document -0.00022916761725282506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0139_text_document -0.00017135825898458961 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0192_text_document -0.00025038733972490736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0001_text_document -0.0002612597204533967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0268_text_document -0.00030775483508702073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0527_text_document -0.0002450825999341174 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0359_text_document -0.00035290511311749176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0315_text_document -0.00026505364760999234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0251_text_document -0.0003160354002570778 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0546_text_document -0.00027593535501232487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0262_text_document -0.0003183575069367642 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0659_text_document -0.0002947758106315529 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0567_text_document -0.0001741629156660883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0190_text_document -0.00025352755314579954 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0078_text_document -0.00024964796075168576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0175_text_document -0.00025506541936154865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0054_text_document -0.00025555970306634487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0008_text_document -0.00029524207821389497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0452_text_document -0.000199634928805321 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0187_text_document -0.00025285663055065095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0011_text_document -0.00021297141649242722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0138_text_document -0.00022778759217604392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0087_text_document -0.0001953318184154979 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0206_text_document -0.0002870975629878886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0611_text_document 
-0.00024458163981459747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0509_text_document -0.0001787757081345656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0205_text_document -0.00023554937871277356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0620_text_document -0.0002520185236716444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0677_text_document -0.00022301112872055435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0132_text_document -0.00025561244200441604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0296_text_document -0.0002253012068856598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0495_text_document -0.00029101930659581156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0444_text_document -0.0003009753416192063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0598_text_document -0.00024953859390345224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0691_text_document -0.00035416563691401055 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0761_text_document -0.00021080283440929313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0417_text_document -0.00033863782739706187 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0317_text_document -0.0002768289718009883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0578_text_document -0.0003211028360358407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0374_text_document -0.00025217196207723947 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0055_text_document -0.00022020631342061864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0481_text_document -0.00024394187795805726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0307_text_document -0.0002774350481105133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0736_text_document -0.00021927408285748486 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0136_text_document -0.00030251214218079053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0550_text_document -0.00023376098042937013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0084_text_document -0.00024484618716366375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0511_text_document -0.00032440721841087237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0380_text_document -0.00023199362334417763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0356_text_document -0.00035580287272947756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0310_text_document -0.0002526262802535738 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0110_text_document -0.0003063445757116061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0668_text_document -0.00024646752300382937 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0306_text_document -0.00022331705652869183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0115_text_document -0.0003421795194248901 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0324_text_document 
-0.00017598828219869743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0202_text_document -0.00025019952804968546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0294_text_document -0.0002964977996849329 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0704_text_document -0.00034193340092088606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0629_text_document -0.0003000100913454835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0608_text_document -0.00033314792957463103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0627_text_document -0.00027662517486203096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0725_text_document -0.0002308977883551111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0472_text_document -0.00016969055286069522 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0230_text_document -0.00034668143602649003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0407_text_document -0.0003072554613775016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0556_text_document -0.00024691426562678927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0505_text_document -0.00025896543472554137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0040_text_document -0.00029337350699737376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0606_text_document -0.00024039263045799383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0096_text_document -0.00025688438655402954 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0281_text_document -0.00017826393466000563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0179_text_document -0.0002987145843993446 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0557_text_document -0.0002545665977705435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0288_text_document -0.0003407221032168543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0769_text_document -0.00033154596910064173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0370_text_document -0.00026418756319139476 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0017_text_document -0.00024071241177027537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0694_text_document -0.0003034159035548678 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0385_text_document -0.00021807629186309122 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0130_text_document -0.00029528370867186834 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0562_text_document -0.00024176561585449852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0506_text_document -0.0002435688410132227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0036_text_document -0.00017344067673492444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0217_text_document -0.00025145685285303255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0289_text_document -0.00027990862231094815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0712_text_document 
-0.0002803072024649089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0724_text_document -0.0002913097590657721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0564_text_document -0.00022731318225581286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0105_text_document -0.00022996707132323673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0120_text_document -0.00021632449585577137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0141_text_document -0.00031827065104563006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0431_text_document -0.0002819283047772193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0142_text_document -0.0002815277519333656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0570_text_document -0.00023765405087801319 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0512_text_document -0.00018998873335086353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0227_text_document -0.00024146656166238478 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0411_text_document -0.0003004350285587871 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0389_text_document -0.000278065966594337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0735_text_document -0.000281790862465637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0585_text_document -0.0002439702643406182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0122_text_document -0.0002537403759927729 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0042_text_document -0.0002423193871139386 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0309_text_document -0.0003495948469686597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0765_text_document -0.00033236219927066605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0636_text_document -0.0002858695230288609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0539_text_document -0.0002348134342912561 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0467_text_document -0.0002691883546328634 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0586_text_document -0.00026279179878842626 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0750_text_document -0.00018511392917200416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0200_text_document -0.00030242676078679127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0671_text_document -0.00030050997791376127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0530_text_document -0.0002635999306185633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0012_text_document -0.00024458976049193177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0082_text_document -0.0002518674158063562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0160_text_document -0.0003223699699116345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0438_text_document -0.00018505528728151655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0195_text_document 
-0.000178243040573062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0185_text_document -0.00016914020058443556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0215_text_document -0.00025069619779996 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0173_text_document -0.0002815996313559731 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0710_text_document -0.0002412312344016045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0348_text_document -0.00026767326358785484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0590_text_document -0.0002457918957813268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0073_text_document -0.00024745396271518434 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0030_text_document -0.000261259507683573 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0079_text_document -0.0002548186120067791 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0072_text_document -0.00027399569205024244 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0019_text_document -0.00021413864579792835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0239_text_document -0.0002472353073125973 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0410_text_document -0.0002940208200391643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0453_text_document -0.00028793368048032474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0543_text_document -0.00024228067857454152 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0007_text_document -0.0002757861162724024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0733_text_document -0.00022515235419999868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0125_text_document -0.0002789873312304594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0569_text_document -0.00032181605988926585 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0331_text_document -0.00025600745719208296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0062_text_document -0.0002429596099341584 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0043_text_document -0.00021272272444376823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0433_text_document -0.00016894488939341143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0235_text_document -0.00029657717456198863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0448_text_document -0.0002464749943157843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0696_text_document -0.0002990022489754111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0559_text_document -0.0002686072777622895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0392_text_document -0.000310214707858456 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0664_text_document -0.00023147049770827952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0483_text_document -0.00027094316845184026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0591_text_document 
-0.0002630382106124418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0272_text_document -0.00028196904286546527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0271_text_document -0.00025700449997059283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0340_text_document -0.00025554555240646604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0159_text_document -0.00025089565112080837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0153_text_document -0.00028712055905116097 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0541_text_document -0.00024771562469786036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0028_text_document -0.00025202328267601593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0067_text_document -0.0001684887169833427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0222_text_document -0.0002607742259699303 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0165_text_document -0.00022397819464514847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0117_text_document -0.00030314913238885265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0669_text_document -0.00023619497538123523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0103_text_document -0.0002734489760396025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0258_text_document -0.00023077416637994847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0097_text_document -0.00022084705451473572 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0419_text_document -0.00029499302425346795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0459_text_document -0.00029225502792904867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0609_text_document -0.0002294695004036345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0104_text_document -0.00021262811723288358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0430_text_document -0.00027993786496231197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0582_text_document -0.00029708818969150343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0457_text_document -0.00029628163028225886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0529_text_document -0.000262417089919526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0029_text_document -0.0003298930375057276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0426_text_document -0.0002575102330835925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0279_text_document -0.00028844680503198393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0596_text_document -0.00033707683763665075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0631_text_document -0.0002473741577858656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0517_text_document -0.00023231405560125397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0507_text_document -0.0002670076117276825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0252_text_document 
-0.0003441336056088313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0626_text_document -0.00028441868553742185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0056_text_document -0.00031718446831561955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0335_text_document -0.0002803626135482851 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0542_text_document -0.0002173887650989829 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0672_text_document -0.0002539392289101208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0284_text_document -0.00026280233213567066 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0257_text_document -0.00031105597221457113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0654_text_document -0.0002443105227741655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0678_text_document -0.00024790807827507997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0018_text_document -0.0003036707221560443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0528_text_document -0.00022103058913292817 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0422_text_document -0.000244961408904958 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0068_text_document -0.00027503974364758305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0245_text_document -0.0002854117937664233 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0617_text_document -0.0002620533561829337 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0255_text_document -0.0002596756834115267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0176_text_document -0.0002881031625880268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0732_text_document -0.0001650387853828719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0221_text_document -0.0003234602042935272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0371_text_document -0.00021389341379725142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0137_text_document -0.00036936242757777487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0177_text_document -0.00031381814871258624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0382_text_document -0.0002539766707583296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0291_text_document -0.00029021877762037306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0597_text_document -0.0003421392407209083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0321_text_document -0.00028701185284984564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0292_text_document -0.0002666537457275393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0273_text_document -0.0002818729480708607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0716_text_document -0.00022785209460036168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0094_text_document -0.0002464632290244915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0026_text_document 
-0.0002557240597452918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0021_text_document -0.0002650994330587443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0242_text_document -0.000165454036266872 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0226_text_document -0.0002302794555560894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0100_text_document -0.00024098227294979652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0519_text_document -0.00030313866988967254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0552_text_document -0.00024472340746819576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0057_text_document -3.716455926414704e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0776_text_document -0.0002289064733184156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0109_text_document -0.00017047069361339454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0219_text_document -0.00028965044054042247 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0706_text_document -0.00024215674906462835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0508_text_document -0.00027868463874204896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0722_text_document -0.0002863276814410383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0456_text_document -0.00033593485551269495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0633_text_document -0.00022535057625369098 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0618_text_document -0.00025951432987823613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0354_text_document -0.00023660867076665115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0514_text_document -0.00023164910184270907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0475_text_document -0.0002717934880479726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0034_text_document -0.00025551605524358457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0300_text_document -0.00028788385285618325 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0454_text_document -0.00023790235664678007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0693_text_document -0.0003083326960421146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0658_text_document -0.00026621224495881597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0304_text_document -0.00023989172020287585 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0090_text_document -0.0003075436157979873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0548_text_document -0.00025027414775809285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0063_text_document -0.00028423970956347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0465_text_document -0.0003385868271697667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0414_text_document -0.0002461873511746418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0169_text_document 
-0.0003497705905560846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0762_text_document -0.0002837602935731654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0701_text_document -0.00021285518550466257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0135_text_document -0.00025887155855837775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0143_text_document -0.00024612669665761364 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0167_text_document -0.00024926275847218885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0523_text_document -0.0002831882638199006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0540_text_document -0.0003218801435428595 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0334_text_document -0.0002504527051869514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0297_text_document -0.00027958478618795354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0738_text_document -0.00024632955113681093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0500_text_document -0.0002959791358976716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0554_text_document -0.00023808022251730368 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0695_text_document -0.0002693302215640041 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0747_text_document -0.00024118244197147295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0108_text_document -0.00024254119141966469 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0113_text_document -0.00024082979776199307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0686_text_document -0.0002440387750957088 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0302_text_document -0.00024571556691977537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0362_text_document -0.00024313649385953578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0355_text_document -0.00033420558196964426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0645_text_document -0.00022171807844232732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0102_text_document -0.00025367115675703917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0681_text_document -0.00029828158945969385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0536_text_document -0.00026580908134122904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0156_text_document -0.00026144782973561495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0163_text_document -0.00026560354050198784 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0277_text_document -0.00027259447866398304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0573_text_document -0.0002454669644465353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0293_text_document -0.0002749898352476231 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0741_text_document -0.0003182032271539976 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0377_text_document 
-0.00032183101559976087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0423_text_document -0.00026140406796931173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0592_text_document -0.00030798312041121676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0657_text_document -0.00027046310241926396 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0584_text_document -0.00022915168410979936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0628_text_document -0.0003144797967546013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0240_text_document -0.00024793054681359315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0361_text_document -0.00025289392596448485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0680_text_document -0.0003249156807537638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0388_text_document -0.0003200320573446812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0435_text_document -0.00024932773314172515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0345_text_document -0.000352339377109151 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0637_text_document -0.0002181818784932686 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0420_text_document -0.0003065464559171036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0378_text_document -0.00025564864920100716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0398_text_document -0.00017648320872558042 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0234_text_document -0.0002556592118802528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0266_text_document -0.000180741470711273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0210_text_document -0.000283307650028466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0711_text_document -0.00024948381052490083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0162_text_document -0.0002657201622822601 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0264_text_document -0.0002773843344343063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0070_text_document -0.00017077838788489376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0223_text_document -0.0002368054569060717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0470_text_document -0.00025662519161828116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0002_text_document -0.0002648571396733212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0069_text_document -0.0002376373641281886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0625_text_document -0.00025775792581353527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0285_text_document -0.0002449626349826061 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0065_text_document -0.0002500622157014314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0147_text_document -0.00023556842149918296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0085_text_document 
-0.0002680577361774155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0394_text_document -0.00033360487612389944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0330_text_document -0.0003496319072752229 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0621_text_document -0.00026102956713198886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0248_text_document -0.0002591069584722685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0274_text_document -0.00031214502293740203 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0526_text_document -0.00023818435518274705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0685_text_document -0.00027077616083688154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0589_text_document -0.00021454413499194508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0128_text_document -0.00034352014327578543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0405_text_document -0.00023102558540219145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0126_text_document -0.0002447224377071115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0075_text_document -0.00024328280481448173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0494_text_document -0.0002720049988519714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0283_text_document -0.00030256682583053806 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0558_text_document -0.0002627148374838038 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0402_text_document -0.00026820426193286214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0261_text_document -0.00022606610063852957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0107_text_document -0.00023977894226853914 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0522_text_document -0.0002971202576914705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0604_text_document -0.0001695977590443741 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0231_text_document -0.0003090810731707665 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0663_text_document -0.0002808357894937908 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0726_text_document -0.0002470374776308281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0152_text_document -0.00024230275654617777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0353_text_document -0.0002875365676033139 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0243_text_document -0.00023312322239977016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0364_text_document -0.00017547132733894936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0213_text_document -0.00026021642426790116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0412_text_document -0.00034525313882415157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0622_text_document -0.0003216956035139199 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0650_text_document 
-0.00032148593882094746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0354_text_document -0.00031099927571265226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0200_text_document -0.00026288736854021024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0391_text_document -0.00030120157866719887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0559_text_document -0.00033262173175005845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0473_text_document -0.0003282483062358124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0528_text_document -0.000365895582017286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0073_text_document -0.00019615092289600435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0170_text_document -0.00019615732562059426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0180_text_document -0.00029826391241638425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0582_text_document -0.00022144916152761252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0067_text_document -0.00025039196312518126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0036_text_document -0.00023453794655702172 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0023_text_document -0.00024046718292842934 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0099_text_document -0.0003548582198382252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0342_text_document -0.00032576405247938887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0353_text_document -0.00025884065661827183 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0044_text_document -0.00028762290770248925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0575_text_document -0.00022250703391263435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0107_text_document -0.00028658440278656646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0566_text_document -0.0002242413543649295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0101_text_document -0.00025523457182268395 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0542_text_document -0.00032490234484662474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0217_text_document -0.000336341248357978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0437_text_document -0.0002806238955687308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0583_text_document -0.0001944036099466144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0172_text_document -0.00040165432728360835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0060_text_document -0.00023820268565279 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0072_text_document -0.00027589339829224633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0393_text_document -0.0003664961403005019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0414_text_document -0.00032640050094795284 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0355_text_document -0.00026267086768592317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0082_text_document -0.00027675886876949677 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0281_text_document -0.0002507184125038478 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0051_text_document -0.0002955728326278075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0133_text_document -0.00033895534360196215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0471_text_document -0.0003249583290321772 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0346_text_document -0.00025269892975402695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0034_text_document -0.0003597803812098366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0300_text_document -0.00029054067647750703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0282_text_document -0.00031819275856278644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0309_text_document -0.00022047319830454594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0125_text_document -0.0003339271619378178 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0434_text_document -0.00032250687905768815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0460_text_document -0.00032737840955958814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0186_text_document -0.00036817332427086083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0504_text_document -0.00037520434935067656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0487_text_document -0.00023497869981097718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0132_text_document -0.00031875608212684787 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0225_text_document -0.00027112709318564797 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0401_text_document -0.00033378436021836355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0477_text_document -0.0003539127090987545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0607_text_document -0.00029769152503833615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0562_text_document -0.0002635535461200955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0497_text_document -0.00019676121443972563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0174_text_document -0.000280048968941906 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0569_text_document -0.0002846942777387065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0591_text_document -0.00024086454535308076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0110_text_document -0.00033123896164923103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0455_text_document -0.00019918331774706222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0166_text_document -0.00030299320911149845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0453_text_document -0.00027898564892737796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0581_text_document -0.00034741102397772714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0341_text_document -0.00027420078298171223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0544_text_document -0.000359960106052341 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0606_text_document -0.0003529909755563099 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0291_text_document -0.00026046023186147605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0086_text_document -0.0002867716530924125 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0578_text_document -0.0001108817386927782 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0083_text_document -0.0003214305663204344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0357_text_document -0.0003059007996821165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0188_text_document -0.0003700321466583531 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0411_text_document -0.00031683945155459305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0218_text_document -0.0002275568413843357 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0003_text_document -0.0002232470277297263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0001_text_document -0.0002588269802342886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0543_text_document -0.00030658288378059576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0307_text_document -0.0001937432554458501 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0169_text_document -0.00021412867224064434 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0149_text_document -0.00029445648351402766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0561_text_document -0.0003245137199738549 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0310_text_document -0.00028866486070796286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0222_text_document -0.0002520967807738254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0547_text_document -0.0003017454528581887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0183_text_document -0.0003205557761638949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0214_text_document -0.00022447860828692386 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0111_text_document -0.0002281520513155878 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0127_text_document -0.0002404263491727334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0054_text_document -0.0002344960672283001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0007_text_document -0.00029308796650945226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0572_text_document -0.00024115117947338366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0103_text_document -0.0003554380548369965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0334_text_document -0.00022154415907974328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0114_text_document -0.0003358495217267899 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0513_text_document -0.0002547322407705563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0237_text_document -0.00025785286289661753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0397_text_document -0.00033317531686410663 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0306_text_document -0.0002403263232130821 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0120_text_document -0.00023805262922692774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0129_text_document -0.00025828449660578406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0262_text_document -0.0003388433440938099 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0459_text_document -0.0001148703143628027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0085_text_document -0.0003139139975696427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0206_text_document -0.0002822891138876659 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0271_text_document -0.00035623232436338923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0610_text_document -0.00023526449481041633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0031_text_document -0.0002605136896592062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0390_text_document -0.00023442874329110232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0043_text_document -0.00024393396170220794 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0012_text_document -0.00036856641866371385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0409_text_document -0.0002976197958858269 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0574_text_document -0.00020244627431123625 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0596_text_document -0.0002892290973997832 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0588_text_document -0.00027153487772157026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0532_text_document -0.00025976541496692754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0236_text_document -0.00037566485973808487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0501_text_document -0.0002799158826581256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0269_text_document -0.0002772943564900957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0540_text_document -0.00019716161047018854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0175_text_document -0.0002423031004074836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0290_text_document -0.00028139406975870025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0233_text_document -0.00034787707721083003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0440_text_document -0.0002619461653179241 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0071_text_document -0.00024498614841725265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0037_text_document -0.0003610826239115939 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0420_text_document -0.0002600795555097568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0404_text_document -0.0002040950865650348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0141_text_document -0.00029488273028836727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0592_text_document -0.00019996210584324688 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0164_text_document -0.00019384504696633997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0162_text_document -0.00028397293275668543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0388_text_document -0.00019716886585348885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0159_text_document -0.00027827065793369874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0372_text_document -0.0003200957118485349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0476_text_document -0.0002983546202761619 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0323_text_document -0.0002416269472396373 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0008_text_document -0.000196846988205648 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0151_text_document -0.00027146549828798404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0533_text_document -0.00033082151018825224 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0344_text_document -0.00031507825757410413 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0481_text_document -0.00031601467832641786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0204_text_document -0.00020437098539854683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0179_text_document -0.0003706225395981237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0496_text_document -0.0003443040119002004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0469_text_document -0.00022162180585066958 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0055_text_document -0.0002886852383260554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0367_text_document -0.00028528967629728904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0277_text_document -0.0002760425208323181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0603_text_document -0.00035602168449419384 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0512_text_document -0.00033735094405926163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0340_text_document -0.00020409686476119691 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0143_text_document -0.00019850542947853238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0140_text_document -0.00026020939632042517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0285_text_document -0.0002264343023157897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0124_text_document -0.00027023502738295313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0531_text_document -0.0002705793532625371 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0375_text_document -0.00022691069665871407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0013_text_document -0.0003333799802252519 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0522_text_document -0.000243481939995933 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0066_text_document -0.00035504218541120377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0332_text_document -0.00027735232825717244 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0228_text_document -0.0003184950331829959 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0445_text_document -0.0003508179986368801 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0430_text_document -0.00025129465094801635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0018_text_document -0.00027981836842029604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0392_text_document -0.0003739220447382562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0505_text_document -0.000398980676570116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0485_text_document -0.00029607647166155567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0130_text_document -0.0002406963619761374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0026_text_document -0.0002572220385008283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0489_text_document -0.00023570904655852102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0006_text_document -0.00020195852509940438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0157_text_document -0.00032680028348833006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0467_text_document -0.0003340943214022902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0454_text_document -0.0003653992024281921 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0600_text_document -0.00019009456253412738 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0163_text_document -0.0002493407030757422 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0248_text_document -0.0003440124023956098 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0339_text_document -0.00025712407819095047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0534_text_document -0.00024028802893179358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0038_text_document -0.00020495827636638026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0597_text_document -0.000351176486232251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0303_text_document -0.0003456182915527073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0425_text_document -0.00022200464825785848 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0000_text_document -0.0003315535250241385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0352_text_document -0.00028612336058741426 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0226_text_document -0.00030129740450991597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0461_text_document -0.00025487427602225745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0545_text_document -0.00022872773425678075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0102_text_document -0.0002835592166132379 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0587_text_document -0.00033564827847697897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0048_text_document -0.00033858047476084023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0336_text_document -0.00037381384462677615 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0502_text_document -0.0003544715223035618 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0427_text_document -0.00022837425047517556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0090_text_document -0.00026727566447651724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0369_text_document -0.00033598278535636194 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0216_text_document -0.00036767226126370613 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0292_text_document -0.0002476495487244962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0243_text_document -0.00033806147568624263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0326_text_document -0.0002725192743462296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0602_text_document -0.0003386020917687969 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0611_text_document -0.0002475431480184715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0499_text_document -0.0002473766564211022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0032_text_document -0.0002851408177452277 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0599_text_document -0.0002170924906780552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0097_text_document -0.00032913434544042444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0182_text_document -0.0002718817554852544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0378_text_document -0.0003605780424180856 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0509_text_document -0.00020037461147756993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0139_text_document -0.00028996408514680153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0456_text_document -0.00029824911284238767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0322_text_document -0.0002866873513416864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0221_text_document -0.00020169453438146766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0153_text_document -0.0002568539383574184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0076_text_document -0.0002563470964947752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0057_text_document -0.00028934147258318096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0284_text_document -0.0002633358277621299 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0075_text_document -0.0003542020133336853 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0422_text_document -0.0002670040366080037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0288_text_document -0.0002582687175685798 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0077_text_document -0.0003391102438693685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0305_text_document -0.0003008490817946171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0273_text_document -0.0002653817061873682 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0242_text_document -0.0002448781942433207 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0050_text_document -0.0003398433124009884 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0525_text_document -0.0002860462478845397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0232_text_document -0.00019774620740653148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0173_text_document -0.0003435288005031101 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0294_text_document -0.00024599133263112835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0016_text_document -0.0002717967551816393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0365_text_document -0.00037903039437164123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0604_text_document -0.0002088127217590141 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0138_text_document -0.00019764275005400696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0178_text_document -0.0002824696088080159 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0377_text_document -0.00025439496382211074 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0042_text_document -0.0003406741244113085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0065_text_document -0.00023424846114057608 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0112_text_document -0.00019313677074461676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0142_text_document -0.0003095287449412706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0184_text_document -0.00025333741815045563 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0495_text_document -0.0002817316346120187 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0276_text_document -0.00034078147118718575 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0301_text_document -0.0003040077750531272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0210_text_document -0.00025716096160298353 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0494_text_document -0.00032046393022247256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0447_text_document -0.00034897289640574213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0333_text_document -0.0003296405548599967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0424_text_document -0.0002740502161790299 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0224_text_document -0.00022714458123629922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0105_text_document -0.00034363243971404173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0081_text_document -0.0002872372090352805 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0579_text_document -0.00025581154131057356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0536_text_document -0.00019475256915422646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0168_text_document -0.0003692302803692929 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0293_text_document -0.00023876622604802913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0021_text_document -0.0002882922005665277 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0230_text_document -0.00024564448893769165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0260_text_document -0.0003225686520264833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0450_text_document -0.0003634459652928689 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0465_text_document -0.00027645525638440404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0394_text_document -0.00030963921241643964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0319_text_document -0.00023334329927625922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0028_text_document -0.00036313463407885765 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0608_text_document -0.00025564669907419214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0538_text_document -0.00019278288308391885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0155_text_document -0.00034000302243969134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0517_text_document -0.0002509806072007909 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0240_text_document -0.00034220184694416697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0515_text_document -0.00019740285264181645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0158_text_document -0.00031502267568822904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0209_text_document -0.00030626516902061067 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0321_text_document -0.000348986228477158 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0296_text_document -0.0002927218722236596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0576_text_document -0.00031884703885737373 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0080_text_document -0.0002242778767677176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0091_text_document -0.00025642014581296705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0259_text_document -0.00024397330133089134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0121_text_document -0.00028896335037186975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0580_text_document -0.00021150345520434648 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0595_text_document -0.00024611075336120286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0062_text_document -0.0003398153377945975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0436_text_document -0.0003384490583704485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0337_text_document -0.00032180256807125913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0059_text_document -0.00022578733674539733 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0115_text_document -0.0003765131080436743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0412_text_document -0.0003215742730449396 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0462_text_document -0.0002245422695597009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0106_text_document -0.00036405219300064597 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0423_text_document -0.00028029912172422236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0366_text_document -0.00032521630849492397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0215_text_document -0.0002571120134876151 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0263_text_document -0.000353082459811595 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0295_text_document -0.000326822292642579 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0443_text_document -0.00030304625919441264 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0557_text_document -0.0002265130266993121 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0010_text_document -0.00019743743439428228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0136_text_document -0.00025992560920928023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0376_text_document -0.00025253051916689003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0235_text_document -0.00019383475662344355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0135_text_document -0.00019960789056665916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0144_text_document -0.0002678902628605307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0548_text_document -0.00025659636029182583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0537_text_document -0.000237672163839986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0128_text_document -0.0002508337955711484 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0287_text_document -0.00029901333173360016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0194_text_document -0.0002514033871582251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0069_text_document -0.00031036518034747307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0324_text_document -0.00027152269930032273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0364_text_document -0.0003245993299318324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0187_text_document -0.0003302486844096023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0096_text_document -0.0003062956939316864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0558_text_document -0.00023694500660912133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0063_text_document -0.0003498659421229062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0551_text_document -0.00026632904273109524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0286_text_document -0.00031830571948368423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0449_text_document -0.0002620222328990345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0255_text_document -0.00032743994990305114 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0358_text_document -0.0003938666304364765 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0383_text_document -0.00028452006167164925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0283_text_document -0.0003322166464351345 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0470_text_document -0.000258744238720393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0403_text_document -0.00032411432235958506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0349_text_document -0.0003296906897932579 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0524_text_document -0.0003443435331616602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0426_text_document -0.00025288497022764084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0486_text_document -0.00035861042515638225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0519_text_document -0.000248397389077527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0380_text_document -0.00026829844879360724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0387_text_document -0.00022119227073337001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0122_text_document -0.00036944219572927674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0518_text_document -0.0003011917754611523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0554_text_document -0.000321480091824964 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0027_text_document -0.0003533262482265537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0418_text_document -0.00023730278437006437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0104_text_document -0.00023868209081281727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0039_text_document -0.00029588940231159373 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0268_text_document -0.00032104344076418505 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0201_text_document -0.000292230059627145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0094_text_document -0.000323725733362109 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0347_text_document -0.00036745776649266704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0416_text_document -0.0003409543488124674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0514_text_document -0.0002770342357912649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0231_text_document -0.0002337063719146717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0330_text_document -0.00032493466645558633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0523_text_document -0.00028683852667057725 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0570_text_document -0.0003469424073533543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0421_text_document -0.0002355667414524093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0009_text_document -0.00033958966888490423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0478_text_document -0.00037452714181020425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0417_text_document -0.00037157900358371685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0482_text_document -0.00032107564301446506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0213_text_document -0.0002779718104193777 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0568_text_document -0.0003302748963883548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0312_text_document -0.00027108401282429457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0550_text_document -0.00034186039289312654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0219_text_document -0.00027051936616135047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0093_text_document -0.00025501046585739055 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0530_text_document -0.0002946513779650046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0035_text_document -0.0003252745507933036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0356_text_document -0.0002657380205070845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0406_text_document -0.0002620270293581152 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0498_text_document -0.0003653407408166962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0297_text_document -0.00028067817330870856 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0266_text_document -0.00035628032009124333 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0428_text_document -0.0003460280889408009 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0074_text_document -0.00025565166227441784 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0244_text_document -0.00033985184011624644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0431_text_document -0.00037036602086903427 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0410_text_document -0.00029949526237892663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0134_text_document -0.00027881676000299146 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0246_text_document -0.00031647397707654777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0408_text_document -0.0002967146805014503 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0563_text_document -0.00033325946969851915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0360_text_document -0.00033990478189638713 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0468_text_document -0.0002584821322450841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0402_text_document -0.00027278965131770946 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0370_text_document -0.00020295833140368427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0148_text_document -0.00024279037083080438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0092_text_document -0.00022359240324368993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0089_text_document -0.00025058114122039534 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0251_text_document -0.00019085874667820006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0061_text_document -0.0003549309183005571 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0609_text_document -0.0002647672434074523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0539_text_document -0.00031752257400591305 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0475_text_document -0.0002522964974057918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0261_text_document -0.00036196222293690685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0304_text_document -0.00031879924506906604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0203_text_document -0.00026311419200259503 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0084_text_document -0.0002685903870422415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0395_text_document -0.00021930149045373045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0131_text_document -0.00031525572610882754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0197_text_document -0.0003250055544565549 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0087_text_document -0.0002494706424870606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0327_text_document -0.00027762277923554745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0279_text_document -0.0003083430926571075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0196_text_document -0.00031048286507434094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0555_text_document -0.00024138490994514737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0041_text_document -0.00031653009280484387 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0317_text_document -0.00030726372172028754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0193_text_document -0.00034407639895572313 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0441_text_document -0.0003077437581952319 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0202_text_document -0.00035799065644308883 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0511_text_document -0.00030396208504619444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0220_text_document -0.00031563469263051037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0474_text_document -0.0002904537122835995 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0577_text_document -0.00032010237765861207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0014_text_document -0.00019481860586783526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0171_text_document -0.0003394041823348506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0361_text_document -0.0002558062319927343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0264_text_document -0.0002513457601549774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0024_text_document -0.00034069289937398433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0432_text_document -0.00032158521162506154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0451_text_document -0.0003039551974410624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0552_text_document -0.0002904199116648874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0584_text_document -0.0002538664480925548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0239_text_document -0.00029585659356578213 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0458_text_document -0.00027200428224862015 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0407_text_document -0.0003061108912685211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0045_text_document -0.0002685534203724513 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0541_text_document -0.0002687083874265679 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0371_text_document -0.0003239815149554464 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0480_text_document -0.00028480018183138863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0590_text_document -0.00023635788418747915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0015_text_document -0.00023164951700334075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0108_text_document -0.00031251045815569193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0320_text_document -0.00019130373682690652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0145_text_document -0.0004018105513267898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0483_text_document -0.00033404230628775514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0521_text_document -0.0003629045692047148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0419_text_document -0.00019355538307594888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0150_text_document -0.00031931294475357857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0526_text_document -0.0002819143043874387 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0589_text_document -0.00022469124701918232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0119_text_document -0.00031109478995926487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0315_text_document -0.00026688680630152287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0546_text_document -0.00035710114951904826 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0510_text_document -0.0002892627585786743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0373_text_document -0.00036982120060819184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0413_text_document -0.00025789399110047885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0249_text_document -0.00025787093140932716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0484_text_document -0.00039747640249024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0493_text_document -0.0002517985792404221 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0491_text_document -0.0002652211441668472 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0385_text_document -0.0002316547001935751 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0005_text_document -0.000290714543042489 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0374_text_document -0.0003159889683761466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0185_text_document -0.0003369343053203455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0345_text_document -0.000288343878814311 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0571_text_document -0.00018838166008344263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0167_text_document -0.0002714687254850031 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0234_text_document -0.00031494166859812457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0318_text_document -0.0003268206155290055 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0520_text_document -0.00025580125931601587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0256_text_document -0.00023823591542563343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0116_text_document -0.00030252744513705306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0088_text_document -0.0003043223524171429 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0556_text_document -0.00036254935821950336 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0302_text_document -0.0002464195089998822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0238_text_document -0.00030693311532016543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0205_text_document -0.00023952239736627944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0019_text_document -0.0003115236813519545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0191_text_document -0.00031249741556856433 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0199_text_document -0.00026528939183645053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0078_text_document -0.0002439509776473377 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0594_text_document -0.00032623079253258036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0195_text_document -0.00024309468267130917 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0030_text_document -0.00034505079788060524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0439_text_document -0.000313161148438969 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0448_text_document -0.00033674896385088213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0350_text_document -0.00027937103577285136 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0267_text_document -0.00027990316836378013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0275_text_document -0.0003240373233746227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0348_text_document -0.0002977493763185773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0560_text_document -0.00019744731070470085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0181_text_document -0.00024582577153508796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0329_text_document -0.00034198380503166554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0516_text_document -0.0003267589630470458 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0363_text_document -0.0002489019202553718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0258_text_document -0.00032334233658009833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0359_text_document -0.00035987415471499623 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0299_text_document -0.00032683611343026025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0457_text_document -0.00025921593001537887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0379_text_document -0.00022476727556616815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0049_text_document -0.00027380222381141376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0368_text_document -0.00025439005748084117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0265_text_document -0.00023975192095218565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0046_text_document -0.0003189906411148287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0311_text_document -0.00020110221297543672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0177_text_document -0.00024285538318068062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0058_text_document -0.00023779525016768145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0040_text_document -0.0002550856807797905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0549_text_document -0.0003769436968567075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0605_text_document -0.00021456324230875702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0160_text_document -0.00033854449229250405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0472_text_document -0.0002487883467469115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0020_text_document -0.00034510827416249317 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0553_text_document -0.0003135334924716841 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0211_text_document -0.00024364851124096691 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0052_text_document -0.0003541654171496327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0466_text_document -0.00025475723682504567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0382_text_document -0.00032694655879361305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0351_text_document -0.0003713644632352606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0433_text_document -0.0002797686093879111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0270_text_document -0.0002759556053746586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0593_text_document -0.0002669498434795677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0529_text_document -0.0002821471576067372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0095_text_document -0.0003133308099231397 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0308_text_document -0.00020949279074901415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0152_text_document -0.00036351719828329717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0064_text_document -0.00030678901532121404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0189_text_document -0.00031709424150066075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0527_text_document -0.00024417776749073654 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0070_text_document -0.0002680981881658349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0400_text_document -0.0002593798629597245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0257_text_document -0.00027539122337661716 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0229_text_document -0.00019362359938287403 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0154_text_document -0.0003372318177773526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0362_text_document -0.0002841575840215986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0573_text_document -0.00019780262502880867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0161_text_document -0.00025623938780712984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0252_text_document -0.00036574619481231154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0386_text_document -0.00028409153738023557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0280_text_document -0.00028235078893270346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0585_text_document -0.0003591167512270668 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0464_text_document -0.0003469074251062574 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0338_text_document -0.0002969586965778641 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0278_text_document -0.00037523287720373535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0506_text_document -0.0002449994567974368 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0033_text_document -0.0001996403556767957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0137_text_document -0.00033148641160045666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0444_text_document -0.0002586467098165628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0247_text_document -0.00023073263565077901 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0109_text_document -0.0002441039887082391 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0004_text_document -0.00023271649855413572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0011_text_document -0.0003741176027693515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0508_text_document -0.0002198975659046473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0126_text_document -0.00023134591341784835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0017_text_document -0.0002488855690185143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0254_text_document -0.0002872080458059506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0567_text_document -0.00023614036077450578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0100_text_document -0.0002675740169569111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0398_text_document -0.00022979888140300082 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0117_text_document -0.00020928513439559754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0147_text_document -0.00021373565320847123 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0176_text_document -0.00019546090607062898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0156_text_document -0.0002686258383757261 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0490_text_document -0.00025788665245942143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0022_text_document -0.0003171090744680125 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0190_text_document -0.00031782937252322326 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0047_text_document -0.00031802026676613546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0207_text_document -0.00031662885342709164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0446_text_document -0.00027908551130471514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0227_text_document -0.00033733035254319147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0435_text_document -0.00027399448876130287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0289_text_document -0.00019694719038293703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0146_text_document -0.0002699463758978633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0598_text_document -0.00036803059610704023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0503_text_document -0.00019238917001147227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0165_text_document -0.0003032499694911512 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0208_text_document -0.0002324778915243651 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0025_text_document -0.00034461966416924293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0442_text_document -0.00029845740389518305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0325_text_document -0.0003573488582687763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0429_text_document -0.00024133252840622868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0056_text_document -0.00031262504271566886 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0212_text_document -0.00023948892639413417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0002_text_document -0.0002463627682743308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0245_text_document -0.0003131758857388708 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0068_text_document -0.0002694331967435558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0381_text_document -0.0003258932251973233 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0452_text_document -0.0002642164231729589 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0500_text_document -0.0002714159542817306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0396_text_document -0.00024335790850090816 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0328_text_document -0.0002516843829910143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0384_text_document -0.00026719508044412176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0389_text_document -0.00028487275709931573 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0586_text_document -0.00025749821847028987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0488_text_document -0.0003655946095122969 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0298_text_document -0.00031665029873002615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0463_text_document -0.00024262656469095593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0118_text_document -0.00034363087609274955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0479_text_document -0.0003008713605776459 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0274_text_document -0.000299235651831285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0272_text_document -0.0003745736454417228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0492_text_document -0.00025495685914334683 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0113_text_document -0.00036183034603409704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0415_text_document -0.0002652065517219049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0405_text_document -0.00030825924668055317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0198_text_document -0.0003007499820493098 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0313_text_document -0.0003553883704883992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0331_text_document -0.0003173906831139949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0314_text_document -0.00025410681301171756 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0053_text_document -0.0003848071363766293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0507_text_document -0.0003368315781090189 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0438_text_document -0.0002453724684716242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0029_text_document -0.0002648420840083915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0399_text_document -0.00029480381163674485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0564_text_document -0.0002816700525268618 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0601_text_document -0.000255851381162692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0241_text_document -0.0003559575342795207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0343_text_document -0.00028596502870150385 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0223_text_document -0.0003019283845909421 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0316_text_document -0.0002310727115490268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0123_text_document -0.0002650814902223781 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0535_text_document -0.0002553585760163846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0250_text_document -0.0003356609205694754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0335_text_document -0.0002547220370698787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0253_text_document -0.00025944708152734894 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0079_text_document -0.00030116112758130255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0565_text_document -0.0003230808526201062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0192_text_document -0.00024257541419356256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0098_text_document -0.00019567998656160198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0247_text_document -0.0001695341620763688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1166_text_document -0.0001624146809319408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1192_text_document -0.00014333841136634416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0818_text_document -0.00012708025826068252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0166_text_document -0.00016352699396236217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0529_text_document -0.00021728611911473635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1356_text_document -0.00015636251730253634 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0858_text_document -0.00014324586822581748 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0823_text_document -0.00015897461323219583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1339_text_document -0.00017108674190078807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0627_text_document -0.00015013827861652795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1370_text_document -0.0001972006042068891 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0341_text_document -0.00012269303700377157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0185_text_document -0.00014744939263600657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1259_text_document -0.0001618181386509441 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0981_text_document -0.00016515493943054124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0515_text_document -0.00015677010982666643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0545_text_document -0.00017095114976158616 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1167_text_document -0.00014364160317566655 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0077_text_document -0.00018828479047274606 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0665_text_document -0.00015465902478345193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1195_text_document -0.0001390587027239324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0085_text_document -0.00018195456827202163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0566_text_document -0.00011920421004161942 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0230_text_document -0.00021205550061396497 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0311_text_document -0.0001436159963386609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0055_text_document -0.00014247480527621313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0810_text_document -0.00014589130200546316 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1270_text_document -0.0001657398838382825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0966_text_document -0.00016088126731396651 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0517_text_document -0.00016628319575737334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0843_text_document -0.00020520743975773465 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0348_text_document -0.00014665699381439506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0797_text_document -0.00015884407950940795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0736_text_document -0.00017445628170628633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0943_text_document -0.00015491157489278548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1054_text_document -0.00015361878566936873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1105_text_document -0.00018148660349496672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0556_text_document -0.00016318924883005205 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0849_text_document -0.00017735550904648365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0492_text_document -0.00015506323262027988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1121_text_document -0.00014514138359296612 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0817_text_document -0.00016129591328185056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0874_text_document -0.00017893089540432167 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0579_text_document -0.00015469510066293024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1250_text_document -0.00014147217452585304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0146_text_document -0.00018002538811588936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0589_text_document -0.00012896971047756227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0169_text_document -0.0001493805247240001 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1084_text_document -0.0002033040613836003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1041_text_document -0.00015483383064936307 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0526_text_document -0.00018457049518666235 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0551_text_document -0.00012464750991886638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0193_text_document -0.00020263121348102227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1402_text_document -0.00015732724749731566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1106_text_document -0.00014994880426810227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0782_text_document -0.0001427354007022011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0659_text_document -0.00017383550442341477 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0588_text_document -0.00014794741120707251 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0990_text_document -0.00013965112476129038 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0833_text_document -0.00016830993685690627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0845_text_document -0.0001578459852914722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1098_text_document -0.00016544832622877084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0402_text_document -0.00016619317471768417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0878_text_document -0.0001705972129133786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0930_text_document -0.00013964228274235854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0046_text_document -0.00016071569403386412 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0440_text_document -0.0001652980777911164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1293_text_document -0.00017149553387813454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0393_text_document -0.0001440150819055646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0049_text_document -0.0001596228732463621 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0305_text_document -0.00016054511332990351 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0868_text_document -0.00015086161253385788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1126_text_document -0.00016278072512770076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0531_text_document -0.0001460824476010622 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1087_text_document -6.535045240771344e-05 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1442_text_document -0.00014918602129116153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0997_text_document -0.00016976775852345032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0366_text_document -0.00012941294747512296 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0165_text_document -0.00021543528225174234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1078_text_document -0.00016961153312648427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0957_text_document -0.00014846894027958484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1002_text_document -0.00016792533024266346 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0269_text_document -0.00015032152307353972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0460_text_document -0.00019090043692832962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1397_text_document -0.00017126164591608773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0250_text_document -0.00017052271730189918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0951_text_document -0.0001529819080510649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1246_text_document -0.0001593904679041581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0876_text_document -0.0001535686933803246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0302_text_document -0.0001828537136916407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0564_text_document -0.00018216559554926296 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0584_text_document -0.0001518375491324927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0622_text_document -0.00017989122128631265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0694_text_document -0.0001937399691335672 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0335_text_document -0.00016284056669703547 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1189_text_document -0.0001274714871382623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0215_text_document -0.0002039840022153184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1390_text_document -0.00013201968828389528 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0204_text_document -0.0002118653491475742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1038_text_document -0.00016037864519813518 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0713_text_document -0.0001810568673406439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0567_text_document -0.0001520426436740549 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1130_text_document -0.0001363916939036548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0221_text_document -0.00016600137546972103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0538_text_document -0.000157091798621546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1232_text_document -0.00015120338743186473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1265_text_document -0.00014339602031912966 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0628_text_document -0.00013718423915964128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0090_text_document -0.00016701812935300498 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0968_text_document -0.00017269235601396706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0248_text_document -0.00015271729639722688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0885_text_document -0.00016313458661522843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0977_text_document -0.0001547258823418314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0749_text_document -0.00016083316892410838 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0527_text_document -0.0001583053763245904 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0985_text_document -0.0001733472209179004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0934_text_document -0.00014798010488118723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0993_text_document -0.00013889389681410628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0088_text_document -0.00019105504252193975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0674_text_document -0.0001260784262432769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0171_text_document -0.00016818286634093132 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1160_text_document -0.00014580962546884019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0640_text_document -6.470029145424178e-05 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1419_text_document -0.0001681159453803032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0488_text_document -0.00017801942490344936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0704_text_document -0.00015673746308375355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0887_text_document -0.0001539147990354861 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1055_text_document -0.0001456873195980062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1258_text_document -0.00017094160309773435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0924_text_document -0.00016796911953592162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0390_text_document -0.000151517525944017 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0612_text_document -0.00014541128245282718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1276_text_document -0.00015875350296674313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0744_text_document -0.00016829864426209354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0399_text_document -0.00013812489725796757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0053_text_document -0.00014239921197099827 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0025_text_document -0.0001675238599840661 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0371_text_document -0.0001302247584407276 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0161_text_document -6.682470879450296e-05 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1444_text_document -0.00014844153787284714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0051_text_document -0.00017069965470177095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0367_text_document -0.00013975554284966803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0036_text_document -0.00020383835732242943 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1398_text_document -0.00016109580427796109 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0434_text_document -0.00016807102499995444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0965_text_document -0.0001581069814377221 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1226_text_document -0.00014313504064392658 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0093_text_document -0.00017857519389779866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0702_text_document -0.00016332263700781356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0949_text_document -0.00015031787169065788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1343_text_document -0.0001814913146656133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1480_text_document -0.00017181771454811032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0503_text_document -0.00016211098122773695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0979_text_document -0.00014013915195024086 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1039_text_document -0.00014875652926934062 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1034_text_document -0.00015743134722066542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0725_text_document -0.00017191287320061663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1185_text_document -0.00017494945169756188 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1288_text_document -0.0001555344866428178 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0742_text_document -0.00013537753432305735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0242_text_document -0.00014710662157811443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1071_text_document -0.00016411831898287708 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0975_text_document -0.00016201729549295002 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1211_text_document -0.00015221296332320523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0435_text_document -0.00015852377744129056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0841_text_document -0.0001637600734893311 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1303_text_document -0.00020336175111220435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1380_text_document -0.00018339831779819768 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0558_text_document -0.00015777501854536213 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0522_text_document -0.00013195539998318593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0181_text_document -0.0001386037527814194 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0098_text_document -0.00020002678200326375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0318_text_document -0.0001502810207694568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1101_text_document -0.00016701450707649348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1183_text_document -0.00014537187039666396 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0054_text_document -0.00016542204453010793 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0962_text_document -0.00016147214242670993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0072_text_document -0.0001666473715046599 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1283_text_document -0.00014610222865137749 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1269_text_document -0.00017003850913949867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0379_text_document -0.00013929800966037666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0109_text_document -0.00015355042744068418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1257_text_document -0.00018749589298954898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0683_text_document -0.00018128248525778134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0932_text_document -0.00015847515536239644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0724_text_document -0.0001675613909564323 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0259_text_document -0.00015830816489537683 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0752_text_document -0.0001559684384825985 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0748_text_document -6.807932805918992e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1456_text_document -0.00014366963847673678 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0038_text_document -0.00016516766636021026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1311_text_document -0.00013191015853023994 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0224_text_document -0.00016218733445741242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1181_text_document -0.00016199692216184222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0292_text_document -0.0001498689357109041 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0006_text_document -0.00021774546028345258 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1077_text_document -0.0002018860545241583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1376_text_document -0.00013948006210998777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0082_text_document -0.0001555107547975781 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1107_text_document -0.00016590320374380407 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1305_text_document -0.00015357096232342907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1263_text_document -0.00017149756229882957 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1484_text_document -0.00014945745805040604 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0470_text_document -0.0001370605666431844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0232_text_document -0.00020871806603751215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0312_text_document -0.00017949882612324094 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1478_text_document -0.00012967198288991124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0176_text_document -0.00014407639684388027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0825_text_document -0.0001738289893346974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0587_text_document -0.00016432007712212363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0726_text_document -0.00021987744618253408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1069_text_document -0.0001658824793312056 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0478_text_document -0.00018343459381217617 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1470_text_document -0.0001540243960271133 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0751_text_document -0.00014076985382851758 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0034_text_document -0.00018327802049255027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0557_text_document -0.00013541979587031706 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0087_text_document -0.00018283903212254103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0554_text_document -0.00018356592152967213 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0680_text_document -0.00013528981629672019 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0095_text_document -0.00016195981108835402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1193_text_document -0.0001692203965318927 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1290_text_document -0.00016769038226722118 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0956_text_document -6.577591782266853e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1433_text_document -0.00020548459039080355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1409_text_document -0.00016625397566908732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0546_text_document -0.0002191514358329144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1137_text_document -0.00022811276854586046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1042_text_document -0.00021400238398657707 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1361_text_document -0.00016277096569402517 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0983_text_document -0.0001583899704928049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1099_text_document -0.0001880639175708719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0675_text_document -0.00015133651117035432 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0000_text_document -0.0001625341698025103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0540_text_document -0.00017771535471350786 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0593_text_document -0.00016815517512679766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1182_text_document -0.00016079574219316162 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0984_text_document -0.00016439910543030416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0463_text_document -0.00016528739620445078 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0406_text_document -0.00016489000174022887 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0963_text_document -0.0001557518593344314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1112_text_document -0.00017657136921387344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0572_text_document -0.00017613819918473885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0894_text_document -0.00016818136837819556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0901_text_document -0.00015958987474506617 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0539_text_document -0.00018218170919691117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1462_text_document -0.00016718904263673248 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0504_text_document -0.00015987218216956836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1196_text_document -0.00017189442585383062 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0916_text_document -0.0001634813294517073 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0840_text_document -0.0002205095381720346 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1059_text_document -0.00017396541314894736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0895_text_document -6.507565239609069e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1439_text_document -0.00015962194770891035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0521_text_document -0.00012643543528015894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0227_text_document -0.00012965619956572215 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0157_text_document -0.00013223758759774493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0134_text_document -0.00014136760030097697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0091_text_document -0.0001381774119190453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0080_text_document -0.00019798512467862197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1405_text_document -0.00016156745631319154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0856_text_document -0.00019974862821575546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0355_text_document -0.00018200165470784005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0904_text_document -0.00015113105990653198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1256_text_document -0.00015548753626235857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0886_text_document -0.00017554167495420438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0703_text_document -0.00015653678368525705 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0870_text_document -0.00016896257320564437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1307_text_document -0.0001903984601165236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1463_text_document -0.00014624534535139798 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1278_text_document -0.00015877224250538676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1176_text_document -0.00015006091298155116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0999_text_document -0.00019757096381691082 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0352_text_document -0.00016078369621718087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1216_text_document -0.00016656194994838216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0506_text_document -0.00016849271470946895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0942_text_document -0.00012907462743559026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0164_text_document -0.00014130711072004757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0071_text_document -0.00019625342379966053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0362_text_document -0.0001551313555160629 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0285_text_document -0.00014941428518043363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0819_text_document -6.68063248499656e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1447_text_document -0.0001529851241921758 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0794_text_document -0.00015413689147171587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1111_text_document -0.00015819723420022034 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0452_text_document -0.0001421845421104754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1008_text_document -0.0002337347761220641 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1064_text_document -0.00015326757579474523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0786_text_document -0.00016886422097510493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0275_text_document -0.00014601039985789424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1004_text_document -0.00014884931563984607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1020_text_document -0.00015986533743428418 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1198_text_document -0.00016926153082778508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0258_text_document -0.00012837862440004137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0179_text_document -0.00015414960057655343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0473_text_document -0.00017497436201235553 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0706_text_document -0.0001432922811537633 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0829_text_document -0.00021642532731730042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1348_text_document -0.00016753124352564838 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0489_text_document -0.00015608314375504165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0721_text_document -6.782620642709544e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1418_text_document -0.00014090986584903154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0657_text_document -0.00012660624973760565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0182_text_document -0.00014735197135805754 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1268_text_document -0.00018616875148016776 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0677_text_document -0.0001272812571523608 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0211_text_document -0.00015478061878918103 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1199_text_document -0.00017175987703134661 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1161_text_document -0.00016736774334113506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1159_text_document -0.00021542375861101283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1060_text_document -0.000145172522445389 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0203_text_document -0.00015175519683168932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0634_text_document -0.00014496879545197283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0214_text_document -0.00017092840392284674 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1292_text_document -0.0001278331608658506 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0168_text_document -0.00015032268010816978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1342_text_document -0.0001385868171608466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0105_text_document -0.00015134071512301172 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1346_text_document -0.0001269932638322507 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0145_text_document -0.00013140246671645566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0174_text_document -0.00020348737894498417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1393_text_document -0.00016800561035040085 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1295_text_document -0.00016164160174570342 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0888_text_document -0.00018931294863807786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0673_text_document -0.00019734558642218287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1412_text_document -0.0002303194107055354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1337_text_document -0.0002167143077720647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1089_text_document -0.0002143413186443493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1031_text_document -0.00015854607653108938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0542_text_document -0.00013590494333364677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0135_text_document -0.0001709192279703633 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0604_text_document -0.0001684909541125075 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0905_text_document -0.00014641981954006535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1028_text_document -0.00015196906818488852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0760_text_document -0.00015046293445613942 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0052_text_document -0.00021207499511319207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0361_text_document -0.000170520846597118 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0408_text_document -0.00016506154746702737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0514_text_document -0.00022220671190117854 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1144_text_document -0.00013485250436339217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0086_text_document -0.00014243329417692134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0016_text_document -0.0001997902496484977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0363_text_document -0.00013773786894858352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0026_text_document -0.00014210492964421037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0013_text_document -0.00014261494951636302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0048_text_document -0.0001994698002434822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0343_text_document -0.0001447668168714561 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0991_text_document -0.00015128160843312126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1151_text_document -0.0002074354130511704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0325_text_document -0.00012703221289405721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0209_text_document -0.00016873053428782402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1325_text_document -0.00014384384709797832 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0042_text_document -0.00014223883045509972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0162_text_document -0.00017796337347992502 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0902_text_document -0.0001491404477097102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0625_text_document -0.00016202535163988179 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0482_text_document -0.00016604798605022845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0502_text_document -0.00012837092768293909 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0225_text_document -0.0001660767481080349 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1190_text_document -0.00017106130812258926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0498_text_document -0.0001266757182953492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0198_text_document -0.00015520576268027733 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1146_text_document -0.00016248205157470968 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1197_text_document -0.00016174543116338102 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0889_text_document -0.0001607107134600685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0873_text_document -0.000142616278481646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1013_text_document -0.00015710288183099663 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1212_text_document -0.0001393562142784163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0107_text_document -0.00014454298256561947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0005_text_document -0.00016005107736770166 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0528_text_document -0.000135126504062645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0101_text_document -0.0001508209689849079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0792_text_document -0.0001451914251150852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0638_text_document -0.00015254108755913753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1253_text_document -6.510273756595438e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1449_text_document -0.00016128557059261363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0761_text_document -0.0001664509064378366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1330_text_document -0.0001645218431584474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0268_text_document -0.0001644207050646909 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0372_text_document -0.00015724536834425392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0757_text_document -0.0001423864254748038 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0031_text_document -0.00014675658024529065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0815_text_document -0.0001383859658316972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0074_text_document -0.0001537984956202417 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0771_text_document -0.0001689143410167348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0936_text_document -0.00021971795320063967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1355_text_document -0.0001615226814646947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0764_text_document -0.00016998404232092888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0516_text_document -6.971484231182006e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1460_text_document -0.000165053699351673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0394_text_document -0.0001284057718439998 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0178_text_document -0.0001469131892145795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1005_text_document -0.00015600815153021962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1103_text_document -0.00017596769632339667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0899_text_document -0.0001629273325614891 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1204_text_document -0.00014895594062286423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1061_text_document -0.00021388747632332592 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0327_text_document -0.0001272010357086257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0233_text_document -0.00013763895692808363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0057_text_document -0.00017734499397533223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0922_text_document -0.00014915400067365785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0996_text_document -6.676606803216924e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1457_text_document -0.0001617530978715898 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0513_text_document -0.00016745873391627768 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1323_text_document -0.00016334347288201646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0426_text_document -6.651419662021617e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1428_text_document -0.00015587845196441837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0879_text_document -0.00016753160949877044 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0297_text_document -0.00021020002387965022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1395_text_document -0.0002065001249653627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0333_text_document -0.00012663980960827745 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0202_text_document -0.00016757511900986375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1171_text_document -0.00015605009719439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0746_text_document -0.00015715662663107652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0111_text_document -6.636371600849109e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1459_text_document -0.00014498611080914514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0806_text_document -0.0001542272198205248 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1332_text_document -0.00014059323118358123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0003_text_document -0.00021921136810582192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1082_text_document -0.00015167674681519302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0745_text_document -0.00016013437912281925 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1187_text_document -0.0001439794004286864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0803_text_document -0.00020770381846091124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0326_text_document -0.0001522225828657984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1131_text_document -0.0001841469798985223 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0678_text_document -6.985047980280837e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1458_text_document -0.0001946928000123715 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1465_text_document -0.00019167154590661272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0662_text_document -0.0001579237297605254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0715_text_document -0.00014555179650182237 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0043_text_document -0.00014278974929024318 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1029_text_document -0.00014073888645548604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0108_text_document -0.00015833078302709586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0450_text_document -0.00015175492630038637 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0465_text_document -0.00013753021891282864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0125_text_document -0.0001507527795280453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1052_text_document -0.00014715051646657675 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0988_text_document -0.00016685078464566375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0487_text_document -0.00016411180976055992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0872_text_document -0.00014019467269017514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0096_text_document -0.00016270832291858043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0865_text_document -0.00014389487056524366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0643_text_document -0.00016448630021886695 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0507_text_document -0.0001428288066548232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0032_text_document -0.00015068597679794492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1228_text_document -0.00015437630829034905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1076_text_document -0.00015394531108560747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1229_text_document -0.00012456910277221792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0219_text_document -0.00022499754937441147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1341_text_document -0.00016328521704579013 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0384_text_document -0.00017564631641705234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0583_text_document -0.00013090562187669734 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0236_text_document -0.00014405833194126315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1267_text_document -0.00018000171719095975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0682_text_document -0.00016551054323893732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1324_text_document -0.00018462225150269493 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0672_text_document -0.00014938124380928987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1237_text_document -0.00016278567817454143 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0256_text_document -0.0001801705673068524 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0689_text_document -0.0001447484970060597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0801_text_document -0.0002061767466472168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1392_text_document -0.00014733469571190217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0630_text_document -0.00015454726196198582 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0041_text_document -0.00021817938865763232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1027_text_document -0.0001298786214228879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0173_text_document -0.00018138691914031344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1476_text_document -6.537426029052535e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1455_text_document -0.00014073700672547374 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0172_text_document -0.0001256698111604605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0228_text_document -0.00014477850589954592 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0812_text_document -0.00016291893052257454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0980_text_document -0.00015081447600800676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0781_text_document -0.00014969267700841283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1280_text_document -0.00012692990964296264 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0141_text_document -0.0001509226797295792 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1046_text_document -0.00014673420805111974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0656_text_document -0.00013712771887536008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0137_text_document -0.00012288465935720468 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0234_text_document -0.00017494555279771646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0909_text_document -0.0001638807582030245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0410_text_document -0.0001597837545344341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0836_text_document -0.00016533452934584025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1313_text_document -0.00016323215726075254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1326_text_document -0.00015298318038302255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1113_text_document -0.00020767747087806112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1040_text_document -0.00021672481833060058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1345_text_document -3.750780482356549e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1492_text_document -0.00014608217797228235 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0831_text_document -0.00016476762411880743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0293_text_document -0.00021699554593230955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1063_text_document -0.00021105380532881085 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0309_text_document -0.0002047160064465581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0353_text_document -0.00017221227712043642 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0597_text_document -0.00014407040837739895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0011_text_document -0.00012338500621226977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0189_text_document -0.00017444094648482255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1174_text_document -0.00015546253659777677 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0474_text_document -0.00014381681585387058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1079_text_document -0.00013897398671509773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0066_text_document -0.0001775141529797601 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0697_text_document -0.0001591753095530007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0734_text_document -0.00015104252960939366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0756_text_document -0.00017569007412200155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1488_text_document -0.00014319238402928628 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0238_text_document -0.00012505086780455324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0235_text_document -0.0001612298998082119 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0274_text_document -0.0001838767026558464 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0417_text_document -0.0002050680150541361 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0316_text_document -6.618274004332223e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1451_text_document -0.00016022951634040166 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0864_text_document -0.00013274827939835476 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0127_text_document -0.00017726136663500188 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0900_text_document -0.0001245682131100599 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0244_text_document -0.00015882962379671717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0291_text_document -0.0001617639209287533 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1205_text_document -0.00022875188600089843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1344_text_document -0.00022135260148234352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1351_text_document -0.0001486995466675951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0441_text_document -0.00012704459393174345 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0213_text_document -0.00022842395422801987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1143_text_document -0.0001635885246037017 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1320_text_document -0.0001582665195457721 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0437_text_document -0.0001775072258892419 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0927_text_document -0.00020242381998746212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0324_text_document -0.0001545652142748447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1125_text_document -6.81810618182006e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1421_text_document -0.0001597601607752306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0735_text_document -0.0001852365073791873 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0573_text_document -0.00014051207788473263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0002_text_document -0.00019277743184432014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1416_text_document -0.00014805940507445537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0776_text_document -7.404378291311911e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1430_text_document -0.00016241379871559847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1322_text_document -0.000128787783253033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0180_text_document -0.00016198778749979117 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0272_text_document -0.00017479293523689206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0896_text_document -0.00016144158141470971 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0543_text_document -0.00017984364568296736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0570_text_document -0.00016364824117358535 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0859_text_document -0.00015835449907818108 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0255_text_document -0.0001598217873096508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0263_text_document -0.00016255842781872452 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1299_text_document -0.00014000754828630328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0835_text_document -0.00014040163951593393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0039_text_document -0.0001553155586782508 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0611_text_document -0.00016963744253436334 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0369_text_document -0.00015530821304263922 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0732_text_document -0.00017145228136049745 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1485_text_document -0.00015677612604573347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0045_text_document -0.0002281728955381295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1138_text_document -0.0001438605015826016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0067_text_document -0.00014531584582320225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0642_text_document -0.00016441162523091893 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0961_text_document -0.00017332479727991208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0249_text_document -0.00015197205734143667 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1221_text_document -0.0001731814495339748 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0496_text_document -0.00016262394941726692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1312_text_document -0.00017293102712287865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0599_text_document -0.0001666866506642948 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0497_text_document -0.00019375975153766717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1413_text_document -0.00019848025607825147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1382_text_document -0.00019589076429120272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0661_text_document -0.00014782024524927844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1251_text_document -0.00017741559068696627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0560_text_document -0.00021151104610061383 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0350_text_document -0.00013945420098709522 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0018_text_document -0.00019887995129951757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0354_text_document -0.00015547221818883392 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0007_text_document -0.00017258790250155727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0594_text_document -0.00014928346919354236 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1003_text_document -0.00020395282974390957 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1375_text_document -0.00016193806231202298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0298_text_document -0.00012846951975899564 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0199_text_document -0.00018501450203643792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0555_text_document -0.0001666431623624946 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0419_text_document -0.00015879678007813504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0400_text_document -0.0001532191493377357 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0608_text_document -0.0001525105010508594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0789_text_document -0.00015180606799817945 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0618_text_document -0.00012173158477293636 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0212_text_document -0.00017380279337573016 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0493_text_document -0.00014661984145047447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0633_text_document -0.00017498764263995404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0920_text_document -0.00014684899748384315 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0621_text_document -0.00013467484359124581 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0129_text_document -0.00020840519038082484 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1033_text_document -0.0001598348481728714 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0982_text_document -0.0001579504038311114 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0300_text_document -0.00014348890732096214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1000_text_document -0.00016758977485705474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1284_text_document -0.00012564178714092916 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0241_text_document -0.0001291169546414064 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0163_text_document -0.00016162437122570363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0971_text_document -0.000167053672743847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0455_text_document -0.000153169062689461 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1124_text_document -0.00018220285305712615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1464_text_document -0.00020403943721555701 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0332_text_document -0.00015665194631128744 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1225_text_document -0.00012614521656368453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0160_text_document -0.00014013351682155804 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0132_text_document -0.00017097003189888156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0953_text_document -0.00020205923223837476 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1396_text_document -0.00021572515124051073 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0319_text_document -0.00014717040017889609 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0014_text_document -0.0001590281915167615 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0867_text_document -0.00015492644283290785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0765_text_document -0.00016331970448459069 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0731_text_document -0.00013421158682600656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0121_text_document -0.00015029220415897293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1340_text_document -0.00018799263088031847 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1414_text_document -0.00015276640696675555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1149_text_document -0.00017523242315344403 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0897_text_document -0.00015019996810193524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1140_text_document -0.00018958357448030594 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1406_text_document -0.00017626847839600905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1491_text_document -0.00014560654964584956 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0811_text_document -0.00014687591072305394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0047_text_document -0.00017120995999685788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0907_text_document -0.00012358194307305004 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0184_text_document -0.0001669042859901341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1202_text_document -0.00017927650173857129 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0595_text_document -0.00022615351090448548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1333_text_document -0.00013263308292770764 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0149_text_document -0.00015078826876750059 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1234_text_document -0.00018088433380229892 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1289_text_document -0.0002049892875117827 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0331_text_document -0.0001888977519054645 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1415_text_document -0.00015011406347016157 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0769_text_document -0.0001677857169117344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0935_text_document -0.00012791852714258775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0216_text_document -0.00016505191077649506 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0940_text_document -0.0001571941770076897 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0762_text_document -0.00015430477295467815 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0445_text_document -0.00016414354518076008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0378_text_document -0.00016197041415550887 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0280_text_document -0.00016713884836257664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0376_text_document -0.0001566985564405333 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1047_text_document -0.00014945756703007185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1045_text_document -0.00022406061576333915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1056_text_document -0.00021167575070222058 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0356_text_document -0.00013677667461372767 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0210_text_document -0.00020680395688920254 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1386_text_document -0.00015760638526064722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1110_text_document -0.00015453551359300138 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0476_text_document -0.00016397827492279263 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1163_text_document -0.00017322755740910864 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0598_text_document -0.00016647054827396185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0511_text_document -0.00015954468834603007 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0279_text_document -0.00015699856956345393 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1254_text_document -0.00014753457389550566 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0115_text_document -0.00016888120307561802 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0365_text_document -0.00016542923944435972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1298_text_document -0.0001646539804391752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0839_text_document -0.0001738747195039087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1227_text_document -0.00016771993042265853 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1282_text_document -0.00014067444517710913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0030_text_document -0.00016285517606428614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0254_text_document -0.00014496445138260225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0658_text_document -0.00016065034278655918 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0978_text_document -0.00016296100118619596 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0851_text_document -0.00013435857715522536 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0130_text_document -0.00020346664054327743 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0357_text_document -0.00012340782876547375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0152_text_document -0.00016711343759871717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0952_text_document -0.00014642480417521495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0834_text_document -6.437624301268493e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1436_text_document -0.00016874334334709629 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1302_text_document -0.00016096287210000209 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1210_text_document -6.894858552341982e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1445_text_document -0.00016183045290192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1328_text_document -0.0001266295474692283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0188_text_document -0.00015582561352317403 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1152_text_document -0.00020064552251710147 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0340_text_document -0.0001669433724610845 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0534_text_document -0.00015678773700557408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0986_text_document -0.00016617707551040578 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0892_text_document -0.00014355808368684807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0062_text_document -0.00016776954911269944 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1173_text_document -0.00013215274478441298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0009_text_document -0.00016045529919363785 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0537_text_document -0.00013995190683390273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0058_text_document -0.00015982388364521005 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0286_text_document -0.00021515127560185033 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1350_text_document -0.0001571023238460824 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0837_text_document -0.00013752792560160014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0068_text_document -0.00016843360937336314 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1291_text_document -0.00015891629060925923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1108_text_document -0.00013203126675290382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0158_text_document -0.00016411457554263775 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0425_text_document -0.00021230579644619457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1007_text_document -0.00015986075758075047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0717_text_document -0.0001701948205017912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1314_text_document -0.00015377327528875894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1097_text_document -0.0001371169603485991 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0131_text_document -6.898654632241831e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1425_text_document -0.00013787821806147992 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0050_text_document -8.926989271708098e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1432_text_document -0.0001697487861338437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0257_text_document -0.00017584653056488438 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1487_text_document -0.00014710604306079943 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0793_text_document -0.00014185002801090756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0655_text_document -0.0002048177532921539 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0339_text_document -0.00016120729340929332 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1109_text_document -0.0001223960473332774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0151_text_document -0.0001453530879881725 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0830_text_document -0.00016900557843194865 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0912_text_document -0.0001762587422023036 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0700_text_document -0.0001701060296011042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0959_text_document -0.00017187900714897894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1156_text_document -0.00014750294655244822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0798_text_document -0.00021646525493979158 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1371_text_document -0.00014394646036576203 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0023_text_document -0.00018149198320960877 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1483_text_document -0.00021898530050654377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1357_text_document -0.00014039681214543667 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0122_text_document -0.00015372532175386033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0462_text_document -0.00014244364882458953 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1021_text_document -0.00018340422661701435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0562_text_document -0.00017297231732963866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0505_text_document -0.00014815439743707823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0787_text_document -6.566845033432497e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1420_text_document -0.00019458570097908857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1399_text_document -0.00014141643718037182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1024_text_document -0.00016580103135856077 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1296_text_document -0.00015323268199491331 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0454_text_document -0.0001652102562002521 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0844_text_document -0.00013734197357525123 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0133_text_document -0.00020796648585433443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1377_text_document -0.0001451504261976597 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1037_text_document -0.00017540222926694344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0705_text_document -0.0001441876443246492 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0623_text_document -0.00016583482656213415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0950_text_document -0.00016662819288852212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0547_text_document -0.00012275448077726155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0240_text_document -0.00014899869630152795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1011_text_document -0.00017669708053004115 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0893_text_document -0.0001380068262512076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0117_text_document -0.00016438704208075348 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1178_text_document -0.00015566951718701304 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1120_text_document -0.0001968298289849741 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0346_text_document -0.00021831789053134852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1354_text_document -0.00015828325931882951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0880_text_document -0.00014054355631147654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1019_text_document -0.00016980283350479704 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0477_text_document -0.0001554936668762301 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1231_text_document -0.00020725872023506206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0660_text_document -0.00015766043466227753 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0471_text_document -0.00016385202931294238 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1168_text_document -0.00015936796236219736 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1214_text_document -0.00020362582892362142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1378_text_document -0.00014733969185422145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1142_text_document -0.00015761417290695306 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1102_text_document -0.0001462337959887598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1015_text_document -6.628581021343156e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1438_text_document -0.00020969954548184154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1372_text_document -0.00016810696279339656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0945_text_document -0.00014940505169210177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1248_text_document -0.0001528759123522868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1119_text_document -0.00014503240257336312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0064_text_document -0.00016497435403224923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0973_text_document -0.00016079571489183747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0509_text_document -0.00022369993639058988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1135_text_document -0.0001420789135129733 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0019_text_document -0.00019132102172392381 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0663_text_document -0.00013840860111135537 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1025_text_document -0.00016229722476905113 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0261_text_document -6.379280220844888e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1431_text_document -0.00020113552848797584 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1403_text_document -0.000137631385661694 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0004_text_document -0.0001982410829408586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1374_text_document -0.00017079185533477647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1164_text_document -0.00015664816631886207 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0740_text_document -0.00016302101603996825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1217_text_document -0.00017408551776591026 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0387_text_document -0.00016607109177005895 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0911_text_document -0.0001657745682527523 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1203_text_document -0.00014511872575023014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0995_text_document -0.00016493395371327803 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0535_text_document -0.00016439024152518683 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1319_text_document -0.00014733104613406654 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0795_text_document -0.0001690933348155181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1184_text_document -0.00017257110857711312 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0915_text_document -0.00018430963487014087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0576_text_document -0.00020464869311400685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0329_text_document -0.00017025858150337216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0929_text_document -0.00013900029887391568 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0142_text_document -0.00016748130895251811 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0413_text_document -0.00016329165429188912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0520_text_document -0.00021736941680849996 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1080_text_document -0.000152394024802439 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1334_text_document -0.0001613925886244472 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1207_text_document -0.00016307955820165195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0869_text_document -0.0001441486009491774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1012_text_document -0.00014737653916533006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0475_text_document -0.00013080649821452762 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0167_text_document -0.00017285284583263907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0494_text_document -0.0001698093675464998 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0264_text_document -0.00021815398376017308 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1358_text_document -0.00014901447037485513 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0641_text_document -0.00016660249038406657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1318_text_document -0.0001633499708227947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1194_text_document -0.00015045744553620076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0059_text_document -0.00018084233263295714 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0578_text_document -0.00014157085008529747 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1095_text_document -0.00017857213682638632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0692_text_document -6.764988138915539e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1422_text_document -0.00012506935057537142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0195_text_document -0.00015870181027859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0431_text_document -0.00017100696021143952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0928_text_document -0.00016895043878239859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0388_text_document -0.00018224812542148869 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0690_text_document -0.00015411896578328878 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0467_text_document -0.00017368407706494485 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0415_text_document -0.00014234118808321183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0119_text_document -0.00012867800914124502 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0187_text_document -0.0001561855555345405 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0424_text_document -0.00016278956850041728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0716_text_document -0.00016733080487519212 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0854_text_document -0.00016728745150059723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0252_text_document -0.0001402355472850419 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0635_text_document -0.0001548607145565727 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0755_text_document -0.00016398300482079378 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0294_text_document -0.00015930313976780547 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0719_text_document -0.00014962011798590175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0785_text_document -0.00017733082052674487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0914_text_document -0.00021601718154650243 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1364_text_document -0.00014373735884721167 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0021_text_document -0.00015452708357587032 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1242_text_document -0.00012337326055085463 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0220_text_document -0.00013280279359938795 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0139_text_document -0.00016483325402012335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1220_text_document -0.000205812101362281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1383_text_document -0.00013902944382715256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0102_text_document -6.614313574973613e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1424_text_document -0.00017765117832566445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0921_text_document -0.00016689470232283466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1215_text_document -0.0001598119671812959 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0722_text_document -0.00017121358295026171 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0908_text_document -0.00014648650042954557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0805_text_document -0.00016928267062664605 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0414_text_document -0.0001293478705948442 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0190_text_document -0.00013665330263746718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0089_text_document -0.00015126314438965768 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1150_text_document -0.000215044687396649 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1360_text_document -0.00015597172028226583 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0453_text_document -0.00013982208384806586 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0827_text_document -0.00014616926897068722 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1273_text_document -0.0001874960360394623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0670_text_document -0.00016167036154720208 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0891_text_document -0.00016689874607763962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1329_text_document -0.00015091546586234206 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1261_text_document -0.00015416991907938216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1147_text_document -0.0001511400052643602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1262_text_document -0.00020077913971010278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0359_text_document -0.0001982479504648514 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0328_text_document -0.00015395483293007372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1240_text_document -0.0001992056962588949 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1391_text_document -0.00012588895437678978 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0156_text_document -0.00014768793829420554 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0989_text_document -0.00016002155200097292 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1219_text_document -0.00016565416354685452 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0923_text_document -0.00014403017231763416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0654_text_document -0.00017805971353518544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0698_text_document -0.0001378765574189532 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0065_text_document -0.00017674772516323008 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1287_text_document -0.0001655522393554671 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0383_text_document -0.00016532529089363128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0532_text_document -0.00015941871123680576 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0260_text_document -0.00021171945741461394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1349_text_document -0.00021687351691044025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1086_text_document -0.00015110950675208554 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0788_text_document -0.0001534703086589531 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0113_text_document -0.00022265313954357227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1363_text_document -0.00018250509992474986 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0687_text_document -0.00014816523064762675 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1083_text_document -0.0001648579725487556 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0918_text_document -0.00018069138970376776 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0585_text_document -0.0001556370193604137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0392_text_document -0.00017182194196636866 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0253_text_document -0.00022743636199465037 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1043_text_document -0.00015536366654460163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0449_text_document -0.00018082637331676116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0568_text_document -0.00015683285278217664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0421_text_document -0.0001622994309867993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0747_text_document -0.00016306397726827083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1175_text_document -0.00020680722402286928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1373_text_document -0.00016431500855232457 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1304_text_document -0.00018184329236177466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1474_text_document -0.00017771217873467545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1482_text_document -0.00019671885429181591 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1385_text_document -0.00022059042319588803 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0317_text_document -0.00017138124128438447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0603_text_document -0.00016266615478246287 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0519_text_document -0.0002114083096384253 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1090_text_document -0.00013998264203286362 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0020_text_document -0.00017460079664023868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1489_text_document -0.0001692744558612773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0871_text_document -0.00017147492025024807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0510_text_document -0.0001398439109182769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0104_text_document -0.00014695579349499111 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1018_text_document -0.00012807718664438787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0245_text_document -6.316427383145754e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1440_text_document -0.00015227044569712423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0758_text_document -0.00015118448137770688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1247_text_document -0.0001362583230994632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1010_text_document -0.00016646881610647842 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1179_text_document -0.0001815942961628268 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0552_text_document -0.00016369442463502984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0276_text_document -0.0001851410933126402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0679_text_document -0.000181472654097742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0571_text_document -0.0001418308891057867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1017_text_document -0.00015801449786372084 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0301_text_document -0.00018119300347565183 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1477_text_document -6.557011502609391e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1450_text_document -0.00016703506632736244 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0712_text_document -0.00018720487080187376 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0676_text_document -0.00017695386933917195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0577_text_document -0.00018219279917528516 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0684_text_document -0.00015972826222397787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0881_text_document -0.000151526607808466 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0644_text_document -0.00013976918087641902 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0076_text_document -6.811248991716918e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1446_text_document -0.00014594750685791662 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0154_text_document -0.00016393685475270443 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0284_text_document -0.00014289503605711647 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0824_text_document -0.00012500509544238644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0231_text_document -0.00017692829431610602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0701_text_document -0.00014991877574473838 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0790_text_document -0.00016308321725598028 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1224_text_document -0.00011997958118530069 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1454_text_document -0.0001690606185783256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0405_text_document -0.000126472844411905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0177_text_document -0.0001644991846388191 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0267_text_document -0.00016724128112573786 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0944_text_document -0.0002082625574331926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1368_text_document -0.00016585627592642267 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0389_text_document -0.00015349598882596164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0610_text_document -0.00013592663525102173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0128_text_document -0.00021224078026059514 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1085_text_document -0.0001515362474321984 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1057_text_document -0.00015458670560645793 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0459_text_document -0.00015120628046970142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0027_text_document -0.00015419195982194006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0759_text_document -0.0001599400000125394 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0458_text_document -0.00015861314731055416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0472_text_document -0.00014721703828635846 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0816_text_document -0.00014278396140952358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0022_text_document -0.00014157968059705966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0832_text_document -0.00014203238461080756 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0097_text_document -0.00016518095127479354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0407_text_document -0.00014681716865250086 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0820_text_document -0.00015472200307186366 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0777_text_document -0.00016821273444081198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0941_text_document -0.00014175740138699298 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0821_text_document -0.00014715021773142656 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0796_text_document -0.00021088768534533313 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1022_text_document -0.00016819933400327033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0967_text_document -0.00016916342679110154 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0548_text_document -0.00015025051932398203 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1117_text_document -0.0001294676654207488 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0197_text_document -0.00013135310501987096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0175_text_document -0.00012481653918559684 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0218_text_document -0.00016966096770181028 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0397_text_document -0.00015715333715996273 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0850_text_document -0.00016087188062233635 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0710_text_document -0.0001855137252502281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0592_text_document -0.00015757612907582822 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0447_text_document -0.00015566574734797269 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0607_text_document -0.00014981399371391565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1249_text_document -0.00015975710844206455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0436_text_document -0.00014657158083208687 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0784_text_document -0.00016481357518055052 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0728_text_document -0.00014575951693438 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0646_text_document -0.0001601547549358656 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1104_text_document -0.00015322997577936088 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0443_text_document -0.00016340194577505127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1154_text_document -0.0001247206585146006 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0194_text_document -0.00016177516907430272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1206_text_document -0.00015253760810690859 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1233_text_document -0.00012706329375937011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0170_text_document -0.00015507046995311794 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0299_text_document -0.00017035702691942293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0605_text_document -0.0001422899277036295 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0033_text_document -0.00013575208908613614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0037_text_document -0.00014633884990467011 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0631_text_document -0.00016217601027429076 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0877_text_document -0.0001828175622258517 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0565_text_document -0.00015875144995229328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0626_text_document -0.00015999193615630214 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0723_text_document -0.00014015883259485993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0650_text_document -0.00017684620577768225 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0590_text_document -0.00017058179804425018 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0938_text_document -0.00015133984193190054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1132_text_document -0.00017699012504638042 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1479_text_document -0.00020118545538965965 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0313_text_document -0.00014356204418389593 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0651_text_document -0.00017337432052813388 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0926_text_document -0.0001448346019979643 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1274_text_document -0.00014308591563402811 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0828_text_document -0.00017834500041411232 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0602_text_document -0.00014672692243329516 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0637_text_document -0.00015798629649839792 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0423_text_document -0.0001836509845586435 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0681_text_document -0.00016952286767650987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0411_text_document -0.00012067131551215888 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0223_text_document -0.00016217171674621445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1186_text_document -0.0001765100136620177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0591_text_document -6.519030577345491e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1441_text_document -0.00016620979558332153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0533_text_document -0.00021068010638327337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1366_text_document -0.00018190833539743673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1466_text_document -6.77656699149642e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1423_text_document -0.00015028582467435317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1075_text_document -0.0001697901643366072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0939_text_document -0.00015927162952712184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1327_text_document -0.00014384103100023696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0647_text_document -0.00015909349132024907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0420_text_document -0.00020294618411948416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0330_text_document -0.00014534131773300256 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1016_text_document -0.00017170180688925072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0600_text_document -0.00016313482646074552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0606_text_document -0.00016271942733732424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1096_text_document -0.00012629894513046023 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0246_text_document -0.00016581582010228004 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0530_text_document -0.00021444766452209495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0344_text_document -0.00018157852863279222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1471_text_document -0.00015883581352585255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0774_text_document -0.00015906208351593867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0733_text_document -0.00016487949862750869 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0283_text_document -0.00017668531772068688 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0561_text_document -0.00015082627785148562 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0466_text_document -0.00020956294138859988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1032_text_document -0.00015717617279975186 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1188_text_document -0.00016867575113490335 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1285_text_document -0.00013836362251826588 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0029_text_document -0.00021846973756306572 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1081_text_document -0.00014444136139636705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0063_text_document -0.00015794892076000702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0739_text_document -0.0001603185290361435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0954_text_document -0.00015790241442091907 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0448_text_document -0.0002247536546873563 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1331_text_document -0.00016016484830309278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1218_text_document -0.00017826344323617095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0688_text_document -0.00016344591624982424 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1169_text_document -0.0001518328612018163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0799_text_document -0.00016015745212618693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0866_text_document -0.00016772217928462375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0853_text_document -0.00016414184482721638 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0955_text_document -0.00016794059551858604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0910_text_document -0.0001707952950759885 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0490_text_document -0.00015763564099859615 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0861_text_document -0.00016629751203024328 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0948_text_document -0.00022519004424206347 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1336_text_document -0.00020131541623590195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0349_text_document -0.00014739593235633784 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0808_text_document -0.00015088026723532896 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1277_text_document -0.00017138716747677106 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0601_text_document -0.00014199045022817607 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0044_text_document -0.0002123155378294079 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0315_text_document -0.00015126790565033696 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1230_text_document -0.00014892424536494257 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1255_text_document -0.00016219604515923145 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0271_text_document -0.00015407400817386168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1051_text_document -0.00018740602831198903 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0481_text_document -0.00016238986966905872 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0970_text_document -0.00015825185377082692 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0412_text_document -0.0001462690665919911 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0040_text_document -0.0001480287233499889 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1068_text_document -0.00021390174927508646 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1091_text_document -0.00020261751949783834 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1394_text_document -0.00015379985797887985 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0862_text_document -0.00015632898789201956 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1461_text_document -6.978510057728695e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1453_text_document -0.0001625132732336281 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0070_text_document -0.00016494060418046228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0852_text_document -0.00014125778850884905 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0012_text_document -0.0001670062832858936 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0484_text_document -0.00012910130489063797 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0140_text_document -0.00016942714359327365 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0919_text_document -0.00016560471114669703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0288_text_document -0.00017267820151836504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1490_text_document -0.00015923068318166766 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0863_text_document -0.00015934534716367842 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1129_text_document -0.00015526632017607095 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0737_text_document -0.00015166226718563977 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0791_text_document -0.00016072059902134906 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0884_text_document -0.0001803142413148948 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0708_text_document -0.00019507013635168262 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0667_text_document -6.797842868279661e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1452_text_document -0.00015425108365976967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1048_text_document -0.0001557566535372043 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0609_text_document -0.0001486922014832451 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1049_text_document -0.00010314726295669035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0336_text_document -0.00015110489846907377 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0800_text_document -0.0001282191763186718 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0196_text_document -0.00017192028765361552 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0931_text_document -0.00014468658775627455 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1072_text_document -0.00012350214919963946 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0207_text_document -0.00012823594249053026 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0144_text_document -0.00016124863235230142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0395_text_document -0.0001460339674439275 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0648_text_document -0.00019945548397061228 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1389_text_document -0.00017712592386337505 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0582_text_document -0.00016961826143628697 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0433_text_document -0.00018215906534328343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0553_text_document -0.00014668917620916662 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1088_text_document -0.00017591692917269525 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0391_text_document -0.00015546530486179318 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0304_text_document -0.00014159774188958436 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0110_text_document -0.00016661197161499275 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1310_text_document -0.0001647494786660025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0398_text_document -0.00015207610020583033 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0446_text_document -0.00017874839079063128 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0574_text_document -0.00015464615502950034 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0772_text_document -0.00017998996521188286 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0686_text_document -0.00022374552824813425 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1335_text_document -8.318738874257729e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1417_text_document -0.0001980263180897782 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1388_text_document -0.0001634034913458812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0855_text_document -0.00015856515338774624 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0457_text_document -0.00021503996494446354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1093_text_document -0.0001750235089084395 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0903_text_document -0.00018680157180581995 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0669_text_document -0.00016797750803698952 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0860_text_document -0.0001526585274766112 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0754_text_document -0.00016251427729059063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0289_text_document -0.00016658951661432627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0946_text_document -0.00014846377243224343 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1275_text_document -0.00014532755538369286 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0024_text_document -0.00020085782167503894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0310_text_document -0.00016995517105782582 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1153_text_document -0.00017099512023635913 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0958_text_document -0.00016745325176526242 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0385_text_document -0.00015488747143365428 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1115_text_document -0.0001900892920247819 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0550_text_document -0.00014361854754586879 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0653_text_document -0.00015157203224797297 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1243_text_document -0.00021193298580903045 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1030_text_document -0.00015019624338905051 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0775_text_document -0.0001763798973561693 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0382_text_document -0.00020921420036595025 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1369_text_document -8.551501141789569e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1429_text_document -0.00018751712868149065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1400_text_document -0.00016829537532338053 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1213_text_document -0.00015084066181358894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0709_text_document -0.00016195928581990262 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0842_text_document -0.00018160270203746462 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0691_text_document -0.0001466611510614814 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1066_text_document -0.00016057775736524703 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0766_text_document -0.00015331050001960014 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1139_text_document -0.0001640469483576742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0499_text_document -0.0001750777552857391 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0699_text_document -0.00012850484180604435 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0200_text_document -0.00012402353499660769 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0239_text_document -0.00016422654526752735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0439_text_document -0.00017376594719277543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1155_text_document -0.0001598186882709137 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0525_text_document -0.00016917681978282753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0523_text_document -0.00015263696687634908 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0079_text_document -0.00016039189468868627 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0416_text_document -0.00017618800875841804 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1481_text_document -0.0001603990699730604 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0727_text_document -0.00016916333202617474 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0307_text_document -0.00012948524611971614 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0138_text_document -6.934197346191516e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1427_text_document -0.00017885898529592837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1475_text_document -0.00015950113709324278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0429_text_document -0.0001227475965565912 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0206_text_document -0.00016574764282947948 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0375_text_document -6.94467160073517e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1448_text_document -0.00021898407239821863 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1352_text_document -0.0001729576935220182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0270_text_document -0.00014175309815679324 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0103_text_document -0.00015222256729727473 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0778_text_document -0.0002006873061783732 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1408_text_document -0.0001633023981526495 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1223_text_document -0.00018248734503882182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1472_text_document -0.00014924831298511557 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0813_text_document -0.0002023904846194145 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1384_text_document -0.00015017176132147363 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0629_text_document -0.00010193584093009983 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0337_text_document -0.00016193099961740715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0848_text_document -0.0001352750889245726 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0124_text_document -0.00015742420916536968 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0753_text_document -0.00019506168512525843 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1410_text_document -0.00012967156389355632 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0243_text_document -0.0001590087132629504 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0714_text_document -0.0001472438118601176 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1001_text_document -0.00016995459524808012 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1317_text_document -0.0001376594963234441 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0112_text_document -0.00014719759568221027 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1260_text_document -0.00016740162676634268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0251_text_document -0.00014556055909872035 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1272_text_document -0.0001462316125597202 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0624_text_document -0.00017233201215418483 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1158_text_document -0.00016532348471925265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0444_text_document -0.0001732052183826621 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0925_text_document -0.00014854511323026862 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1050_text_document -0.00015156958641033844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1134_text_document -0.00017965845673990997 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0695_text_document -0.0001682718027734702 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0508_text_document -0.00014679933705210826 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0615_text_document -0.00016379095940965174 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0265_text_document -0.0001595953223745724 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1316_text_document -0.00014937910370135967 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1148_text_document -0.00016626192441576874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1315_text_document -0.00012644557806052652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0226_text_document -0.0002038866364425408 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0320_text_document -0.00012781772365927512 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0237_text_document -0.00014873714013968685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1353_text_document -0.00016647930161999792 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0287_text_document -0.00017174129317793987 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0262_text_document -0.0001538621049979398 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1067_text_document -0.0001563063688577774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0430_text_document -0.0001379470171727049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0001_text_document -0.00020088705868563327 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0360_text_document -0.00013316662932109192 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0017_text_document -0.0001901234116854142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1468_text_document -0.00017072715683731255 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0501_text_document -0.0001482947975425305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1141_text_document -0.0001427297599327488 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0998_text_document -0.00015030760729530093 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1145_text_document -0.00017934778278835527 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1473_text_document -0.0001624712517575163 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0396_text_document -0.00017056782495579372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0913_text_document -0.00016750774701331024 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0380_text_document -0.00016144568293099387 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0544_text_document -0.0001916682997566083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1411_text_document -0.00014869336257922915 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0106_text_document -0.0001578604687567487 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0442_text_document -0.00017245289000782427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0418_text_document -0.00016980011307450558 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0491_text_document -0.00017116434783254355 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0368_text_document -0.00016266646479915197 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0296_text_document -0.00017164098089283116 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0483_text_document -0.00016127833957743735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0536_text_document -0.00016314013780926182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1306_text_document -0.00022395345242528445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1347_text_document -0.00018132340700948072 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1467_text_document -0.00016629277085070321 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0969_text_document -0.0001452258606588068 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1036_text_document -0.00014466593622241737 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0010_text_document -0.00022820951721846 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1044_text_document -0.00014790724622100283 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0619_text_document -6.63903375071058e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1437_text_document -0.00015582467512476796 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0282_text_document -0.000166431942170938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1162_text_document -0.00015151942372449155 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0750_text_document -0.00013878034925318548 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0126_text_document -0.0001276561557767454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0192_text_document -0.00021753021732075337 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1362_text_document -6.56791423000064e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1435_text_document -0.00021714461635167153 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1359_text_document -0.00017732782166807323 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0685_text_document -0.00018223642150176753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1469_text_document -0.00014959851925403812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1338_text_document -0.00015949039029928227 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0273_text_document -0.00016354645980648047 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0060_text_document -0.00013120579158737794 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0229_text_document -0.00020283796144492631 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0358_text_document -0.00015607531075694943 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0743_text_document -0.00016785011193580181 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1294_text_document -0.0002289988842393063 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1136_text_document -0.00012609574092355686 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0155_text_document -0.00015391393870532928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1122_text_document -0.00017291662195148216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0707_text_document -0.00016930397761791359 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1301_text_document -0.0001503707315668261 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0814_text_document -0.00013755997103848938 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0083_text_document -0.0001641777724672735 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0303_text_document -0.00016249963588376947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1208_text_document -0.00015091532371495184 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1116_text_document -0.00016925706424090195 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0512_text_document -0.00012331579081225924 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0205_text_document -0.00016572415150664923 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0847_text_document -0.00015896504445652774 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0464_text_document -0.0002172811514018951 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1365_text_document -0.0001467552821914243 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1266_text_document -0.00016053641120205404 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0278_text_document -0.00016092234700721057 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0075_text_document -0.00014739875025912542 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0201_text_document -0.00016162938474159285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1308_text_document -0.00014983174047713567 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1235_text_document -0.00014507763071409317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0056_text_document -0.0001514590704245521 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0061_text_document -0.00016877330395559598 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0917_text_document -0.00016571159380006375 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0401_text_document -0.00017140951259524858 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0617_text_document -0.00014775266341610126 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1236_text_document -0.0001618545976514836 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0974_text_document -0.00017574442993022852 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0693_text_document -0.00015133337648195428 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1245_text_document -0.0001641743334029538 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0524_text_document -0.00016161723136536173 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0541_text_document -0.00020025714719197222 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0351_text_document -0.00013146292989858402 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0099_text_document -0.00015213655142952746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0614_text_document -0.0001594237312959875 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0403_text_document -0.00019928522546740278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0321_text_document -0.0001718576209613083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0370_text_document -0.00019870747890711344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1379_text_document -0.00014150759607607492 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0084_text_document -0.00018497707327481416 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0666_text_document -0.0001494245626623757 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0783_text_document -0.00014567464546593923 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0028_text_document -0.00012921109005833406 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0208_text_document -0.00015522839502673537 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1244_text_document -0.00016889787876953445 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1300_text_document -0.00016128963916680415 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0883_text_document -0.00015501692461064587 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1239_text_document -0.00021246309426624894 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0308_text_document -0.00014578020908177723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0035_text_document -0.00017141531649575344 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0486_text_document -0.00014815746443624483 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1271_text_document -0.00013641616762289285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0078_text_document -0.00015052190193183982 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1127_text_document -0.0001719375436698932 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0377_text_document -0.00014287195470541003 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0652_text_document -6.855329371241895e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1443_text_document -0.0001610074517778759 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0711_text_document -0.0001617237746700382 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0838_text_document -0.00020562372967146788 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0322_text_document -0.00015409004826587622 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1222_text_document -0.00016204355350038122 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0290_text_document -0.00016418022845829302 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0857_text_document -0.00015404756921054644 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1062_text_document -0.000126591638339678 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0186_text_document -0.00015729247418431926 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1114_text_document -0.00014957377500349715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1073_text_document -0.00014652199914549352 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0802_text_document -0.00015695341294768054 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1123_text_document -0.00016273826976360268 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0281_text_document -0.0001585705206926955 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0770_text_document -0.00015456957120516966 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0620_text_document -0.00017625472845639211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0581_text_document -0.0001552911388847687 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0456_text_document -0.00014069857571193372 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0094_text_document -0.0001389384065464389 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0092_text_document -0.00013931998654942435 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0645_text_document -6.697683673653888e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1426_text_document -0.00021210844645545962 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0334_text_document -0.00014137119144044108 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0809_text_document -0.0001673120682697486 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0422_text_document -0.000160278712649626 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1170_text_document -0.00013434353209494217 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0153_text_document -0.0001256954694412835 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0147_text_document -0.00013924245755776423 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0100_text_document -0.0001472288250125752 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0636_text_document -0.00014097711532807256 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1026_text_document -0.00014773134717397182 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0639_text_document -0.00015352249953986432 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0081_text_document -0.00016123223192793427 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0720_text_document -0.00021482114302636226 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1094_text_document -0.00015645132444863934 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0381_text_document -0.00020944206443748354 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0342_text_document -0.0002062452865086777 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1387_text_document -0.00019918740595113046 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0347_text_document -0.00012991208314417127 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0191_text_document -0.00016986053451494 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0409_text_document -0.00015370758356460988 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1118_text_document -0.00012877020616415807 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0159_text_document -0.00017410552918525 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0485_text_document -0.00014130939816487588 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1009_text_document -0.00016995001697396185 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1165_text_document -0.00013186413368723246 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0116_text_document -0.0001780030357606211 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0586_text_document -0.00019946492842372164 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1381_text_document -0.00018695527893767175 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0668_text_document -0.0001509922737380354 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0992_text_document -0.00013508028066455278 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0073_text_document -0.00019538060272292147 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0664_text_document -0.00014994073210138526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1264_text_document -0.00013272391894576728 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0183_text_document -0.00014297447569020546 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0822_text_document -0.00022036909181227284 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0338_text_document -0.00016213052378631979 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1200_text_document -0.00016695883561519947 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0479_text_document -0.00016163212750933134 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0972_text_document -0.00014238258162886823 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0826_text_document -0.0001495758517091787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0780_text_document -0.00015672227745549565 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0432_text_document -0.00016811295148550867 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0373_text_document -0.00015381983397418742 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0767_text_document -0.00015443879289861833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0469_text_document -0.0001563389422706874 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0779_text_document -0.0001249130802778805 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0217_text_document -0.0001867001004554042 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1177_text_document -0.0001666242847378148 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0364_text_document -0.00014083154963158667 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1014_text_document -0.00015687949743293156 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0461_text_document -0.00016106205104171245 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1209_text_document -0.00015977557248552813 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0976_text_document -0.00015626689249684247 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0804_text_document -0.00018295367015701234 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0549_text_document -0.00016294566957482463 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1180_text_document -0.00012895551040454305 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0120_text_document -0.00020400658875872852 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1401_text_document -0.00018195688820768198 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0559_text_document -0.00017126939272322272 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0933_text_document -0.00016370097379399474 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1157_text_document -0.00016306170313565844 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0114_text_document -0.00016611485489014685 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0480_text_document -0.00016347151406159876 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0964_text_document -0.00014631075526270524 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1023_text_document -0.0001766669845430746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0580_text_document -0.0001530457703608652 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1053_text_document -0.00017282679878998538 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0266_text_document -0.00015993384615869773 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0438_text_document -0.00016368168822886812 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0875_text_document -0.0001642440794865544 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1286_text_document -0.00015130922676851358 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0123_text_document -0.00015810251823530065 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0729_text_document -0.00017183092026629049 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0937_text_document -0.00015814901712857293 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0295_text_document -0.00016113452432452787 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0277_text_document -0.00015619849162394083 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0069_text_document -0.00020959278805355666 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1006_text_document -0.00014410102715087833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0994_text_document -0.00016078851749564712 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1100_text_document -0.00017027983908149928 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0960_text_document -0.00020003986430140833 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1407_text_document -0.00015782966699670828 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0741_text_document -0.00016914121141032142 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0374_text_document -0.00022097211894746266 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1065_text_document -0.0001571948997030719 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0730_text_document -0.00015974850807369602 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0306_text_document -0.00021789133441721216 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1074_text_document -0.0001233286077686087 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0222_text_document -0.0001618458158759695 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0008_text_document -0.0001615455933676868 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0763_text_document -0.00018103419979578447 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0563_text_document -0.00016396951255152545 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1172_text_document -0.00015073530126545124 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0632_text_document -0.00021341112078889535 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1092_text_document -0.00017307825526693754 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0906_text_document -0.00015218172857770168 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0613_text_document -0.00015344576003029675 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0890_text_document -0.0002173692545799705 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1070_text_document -0.00013765738216305341 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0136_text_document -0.0001734720773190499 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0947_text_document -0.00014938640060072316 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1238_text_document -0.00016249447142088974 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0882_text_document -0.00016869913054486763 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0495_text_document -0.00016113655951414623 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1128_text_document -0.00014335521205971165 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1279_text_document -0.00017876209492720022 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0575_text_document -0.00015697716171522857 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1201_text_document -0.00016786332713047465 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0404_text_document -0.00015265307759079717 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0773_text_document -0.00017523326611003096 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0898_text_document -0.0001617143182584976 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1321_text_document -0.00016606821407511515 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0846_text_document -0.00016912687076932746 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0518_text_document -0.00021366183110486437 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1367_text_document -0.00020204352923874265 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0345_text_document -0.00016502825564571993 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0015_text_document -0.00015942624346907078 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0451_text_document -0.00016489707368570657 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1297_text_document -0.00015383928605482828 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1241_text_document -0.00020098250321233543 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0987_text_document -0.00013164332599265317 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0150_text_document -0.00017734345224022555 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0696_text_document -0.00014991203547650442 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0807_text_document -0.0001485622941114972 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1035_text_document -6.618997286354476e-05 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1434_text_document -0.00014823391536404723 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0649_text_document -0.00019808432009137914 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1404_text_document -0.00015719535570287444 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1281_text_document -0.00015574089977064715 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0768_text_document -0.00015602103595017177 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0738_text_document -0.0001628543123496676 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1309_text_document -0.00017573876028003664 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1133_text_document -0.00022550434769031454 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1058_text_document -0.0001552331578668301 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0427_text_document -0.00015324611041067144 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0468_text_document -0.000170011687280753 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0500_text_document -0.0001447196970422825 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0148_text_document -0.0001784724948903193 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0386_text_document -0.0001498982424414089 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0616_text_document -0.0001623859542981665 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0428_text_document -0.00013361743194427285 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0118_text_document -0.00017779427607269958 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0569_text_document -0.00020986255379125862 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0323_text_document -0.00015466768584649975 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0718_text_document -0.00014830412557015837 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1252_text_document -0.00016505017860901167 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1191_text_document -0.00013490785926470453 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0143_text_document -0.00020564465624924356 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0314_text_document -0.00018728523619804526 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0671_text_document -0.00017721472581997018 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1486_text_document -0.00017590602804728252 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0596_text_document -0.0011586118073822192 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0012_text_document -0.001160584792279032 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0001_text_document -0.0011596589068878752 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0073_text_document -0.0011597796811325983 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0045_text_document -0.0011595856370794394 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0084_text_document -0.0011595638420318743 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0065_text_document -0.0011595056492598743 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0032_text_document -0.00019344370639184713 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0085_text_document -0.0011607702873631174 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0064_text_document -0.0011599628933925152 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0025_text_document -0.0011589412563486842 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0042_text_document -0.0011602895576833848 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0055_text_document -0.0011597331783997562 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0023_text_document -0.0011594813904417097 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0028_text_document -0.0011606546026116473 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0036_text_document -0.0011598078108047945 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0037_text_document -0.0011617470351964222 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0016_text_document -0.0011581358123008063 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0000_text_document -0.0011596262720494357 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0019_text_document -0.0011610223957263077 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0046_text_document -0.0011584882104731472 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0059_text_document -0.0011620862692660026 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0017_text_document -0.001161457666864065 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0072_text_document -0.0011590371889477892 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0033_text_document -0.0011609938621736805 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0006_text_document -0.001161141044100396 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0061_text_document -0.0011594671231655441 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0071_text_document -0.0011597802979499891 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0057_text_document -0.0011595547972095988 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0011_text_document -0.001158806362284141 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0047_text_document -0.0011602692107071176 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0004_text_document -0.0011585986122886333 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0009_text_document -0.001158546881103677 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0070_text_document -0.0011602619863458935 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0018_text_document -0.0011592879277164485 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0054_text_document -0.0011604245887073812 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0049_text_document -0.0011591629192252762 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0003_text_document -0.0011597122241032863 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0021_text_document -0.0011606137082184533 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0051_text_document -0.0011606825853273499 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0074_text_document -0.0011595027201270456 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0027_text_document -0.0011613556290724053 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0050_text_document -0.0011598527410031864 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0079_text_document -0.001158761645522561 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0022_text_document -0.0011593918929374918 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0030_text_document -0.0011580315931549376 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0034_text_document -0.0011604072238482566 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0020_text_document -0.001158231578949041 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0035_text_document -0.0011598418077399845 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0015_text_document -0.001159325008238741 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0066_text_document -0.0011599099580640463 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0044_text_document -0.0011584870238244551 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0010_text_document -0.0011599085999661118 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0002_text_document -0.0011589833818773999 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0041_text_document -0.0011606481795132088 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0067_text_document -0.001161458692060555 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0048_text_document -0.0011622006339082917 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0013_text_document -0.001159737838020273 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0083_text_document -0.0011590437070181112 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0053_text_document -0.0011634257187213065 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0008_text_document -0.0011590984828017124 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0014_text_document -0.0011602559046463836 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0069_text_document -0.001160367797520441 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0056_text_document -0.0011604031640501619 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0062_text_document -0.0011610001203209528 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0031_text_document -0.0011613029071807482 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0007_text_document -0.0011606767800460063 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0077_text_document -0.0011594886287987906 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0058_text_document -0.0011611587128695477 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0076_text_document -0.0011625946322648768 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0078_text_document -0.0011598797869962062 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0005_text_document -0.0011606009649910922 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0081_text_document -0.0011598277708955993 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0040_text_document -0.0011608718238032025 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0068_text_document -0.0011604076497221763 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0075_text_document -0.0011584462678902703 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0063_text_document -0.0011603135740733204 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0029_text_document -0.0011622164917135802 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0039_text_document -0.001160612435595214 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0026_text_document -0.0011604065740406324 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0052_text_document -0.0011627256914647336 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0024_text_document -0.0011591747252301023 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0043_text_document -0.0011607364688750981 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0060_text_document -0.0011606778657245907 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0082_text_document -0.0011588098597487147 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0080_text_document -0.0011601593642272241 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0038_text_document -0.001715327772970356 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0039_text_document -0.0017476947624003078 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0014_text_document -0.0017267559770325844 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0034_text_document -0.0011498438827029142 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0007_text_document -0.0017659420675981785 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0020_text_document -0.0017365986177600462 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0026_text_document -0.0017510246549559635 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0036_text_document -0.0017252368964000024 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0030_text_document -0.0017577211312850632 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0015_text_document -0.0017721505923411433 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0018_text_document -0.0017199608019077789 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0033_text_document -0.001763655692201186 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0027_text_document -0.0017368406475642309 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0023_text_document -0.0017159618112572714 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0024_text_document -0.0017364731392365761 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0009_text_document -0.0017439630879065747 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0025_text_document -0.0017500328397861851 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0010_text_document -0.0017865858138443973 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0032_text_document -0.0017566977721906304 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0029_text_document -0.0017371397837150156 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0021_text_document -0.0017582147893048033 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0040_text_document -0.0011272603991442094 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0000_text_document -0.0017149137099469502 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0013_text_document -0.0011524697523897062 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0005_text_document -0.0017728444816641966 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0022_text_document -0.0017617078558540117 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0011_text_document -0.0017090530525973265 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0038_text_document -0.0011492816454542877 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0003_text_document -0.0017362843828160517 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0019_text_document -0.0017368328836137243 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0031_text_document -0.0017467857780397841 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0012_text_document -0.0017828246319376343 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0041_text_document -0.0011581997009018688 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0004_text_document -0.0011523562998874855 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0001_text_document -0.0017729287558360615 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0037_text_document -0.0011559024877821585 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0006_text_document -0.001756650413292843 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0016_text_document -0.001156615396883475 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0002_text_document -0.0017153661936226234 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0017_text_document -0.001764036695919234 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0028_text_document -0.001753337750065643 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0035_text_document -0.0012705251979895095 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0008_text_document -0.00269701351832541 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0001_text_document -0.002871694450382552 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0000_text_document -0.0009312920312920378 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0002_text_document -0.003491293635742718 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0000_text_document -0.0018087063642572823 
/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0001_text_document -0.002558997095701873 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_35_text_document -0.0030813105581653888 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_33_text_document -0.0019635118705125343 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_59_text_document -0.004901530213612799 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_3_text_document -0.003793853523990452 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_25_text_document -0.0025955935796863213 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_13_text_document -0.002786190590856374 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_43_text_document -0.003860604753313106 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_30_text_document -0.00262527053779086 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_53_text_document -0.0025961058890461132 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_12_text_document -0.002517804312074853 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_47_text_document -0.001965319952716967 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_57_text_document -0.0024031443284573315 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_49_text_document -0.0021544653036229956 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_28_text_document -0.001966106318481444 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_60_text_document -0.004942382135998647 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_5_text_document -0.004476005981762131 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_38_text_document -0.003703151369852988 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_18_text_document -0.0035249004189965697 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_8_text_document -0.002202908872645994 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_29_text_document -0.0019659892928062975 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_58_text_document -0.0033082157671442004 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_26_text_document -0.004728972404877124 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_39_text_document -0.004933803932309063 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_1_text_document -0.004932534365989358 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_2_text_document -0.002416452767493874 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_48_text_document -0.002620239890699942 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_54_text_document -0.004920228925189318 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_6_text_document -0.003526184767940545 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_10_text_document -0.003478586000712196 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_23_text_document -0.004325225087180493 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_41_text_document -0.0029001545338120283 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_21_text_document -0.003235865499567488 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_24_text_document -0.003708174490218723 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_17_text_document -0.0037061510536654098 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_20_text_document -0.0034825398408553174 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_22_text_document -0.0023953996094720226 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_50_text_document -0.0026860110123915503 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_46_text_document -0.002775825753828905 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_37_text_document -0.0031227832671625543 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_42_text_document -0.0029663524802271864 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_27_text_document -0.0038380125626091056 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_61_text_document -0.002556361839801216 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_36_text_document -0.0025955479628912517 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_14_text_document -0.0010043379135763284 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_62_text_document -0.0037134182490424516 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_19_text_document -0.0019352359233920818 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_55_text_document -0.002732128376890259 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_11_text_document -0.004389286673762447 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_7_text_document -0.0025959811034233144 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_15_text_document -0.004279327177425728 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_31_text_document -0.0049375674289130365 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_4_text_document -0.004749708406741731 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_40_text_document -0.0025599583023118502 
/eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_34_text_document -0.0026884724043408277 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_44_text_document -0.0019620663976446513 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_56_text_document -0.002683235274555792 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_45_text_document -0.004312637092757081 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_0_text_document -0.0035302801429202123 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_9_text_document -0.0024212069391470387 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_51_text_document -0.00401255756160585 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_32_text_document -0.003602075549037339 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_16_text_document -0.002645748366381084 /eagle/datasets/dolma/data_Llama2Tokenizer_stackcode_fused_5GB/fused_52_text_document diff --git a/ALCF_utils/helpers_alcf.sh b/ALCF_utils/helpers_alcf.sh deleted file mode 100644 index b70ac43407..0000000000 --- a/ALCF_utils/helpers_alcf.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/bash --login - -ezpz() { - if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz - else - echo "Found ezpz!" - fi - if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then - echo "Has ezpz installed. Nothing to do." - else - echo "Does not have ezpz installed. Installing..." 
- echo "Using $(which python3) to install \`ezpz\`:" - python3 -m pip install -e ezpz > ezpz-install.log 2>&1 - fi - source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit - source ezpz/src/ezpz/bin/getjobenv || exit -} - -saveDSenv() { - echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" - { - echo "PATH=${PATH}" ; - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" ; - echo "http_proxy=${http_proxy}" ; - echo "https_proxy=${https_proxy}" ; - echo "CFLAGS=${CFLAGS}" ; - echo "PYTHONUSERBASE=$PYTHONUSERBASE" ; - } > .deepspeed_env -} - -# makeDSenv() { -# saveDSenv -# } - - -# makeDSenv() { -# echo "PATH=${PATH}" > .deepspeed_env -# echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env -# echo "http_proxy=${http_proxy}" >> .deepspeed_env -# echo "https_proxy=${https_proxy}" >> .deepspeed_env -# echo "CFLAGS=${CFLAGS}" >> .deepspeed_env -# echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env -# } - -sumWeights() { - local file_list=$1 - weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") - # weights=$(echo "$weights" | tr ",]" "]") - # echo "weights: $weights" - python3 -c "import numpy as np; print(np.sum(${weights}))" -} - -sumFiles() { - local rd=$1 - for f in $("${rd}/*.txt"); do - ws=$(sumWeights "${rd}/${f}") - echo "sum($f.weights)=${ws}" - done -} - -# setupData() { -# cidx=$1 -# echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" -# dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" -# if [[ -z "${DATA_FILE_LIST}" ]]; then -# DATA_FILE_LIST="${dfl}" -# else -# echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" 
-# fi -# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" -# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" -# export WEIGHT_SUM="${WEIGHT_SUM}" -# export NDOCS="${NDOCS}" -# echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" -# echo "WEIGHT SUM: ${WEIGHT_SUM}" -# data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") -# export DOLMA_CHUNK_IDX="${cidx}" -# export DATA_FILE_LIST_STEM="${data_file_list_stem}" -# export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" -# mkdir -p "${DATA_CACHE_PATH}" -# } -# - - -setEnv() { - if [[ $(hostname) == x4* ]]; then - SETENV_FILE="${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" - if [[ "${SETENV_FILE}" ]]; then - # shellcheck source=/home/foremans/anl_24_release_q4/llm.devkit/setenv.sh - source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" - else - echo "Unable to source ${SETENV_FILE}, exiting!" - exit - fi - elif [[ $(hostname) == x3* ]]; then - # ---- load conda ----------------------------------- - module load conda/2023-10-04; conda activate base - if [[ "${VIRTUAL_ENV}" ]]; then - echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" - else - echo "Not using VIRTUAL_ENV" - # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit - fi - else - echo "Unknown hostname $(hostname)" - exit 1 - fi -} - -makeHostfiles() { - GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - export GPUS_PER_NODE="${GPUS_PER_NODE}" - # ---- Make MPICH hostfile ---------------- - export hostfile_mpich=hostfile_mpich - cat "$PBS_NODEFILE" > "${hostfile_mpich}" - # ---- Make DeepSpeed hostfile ------------------- - export hostfile_deepspeed=hostfile_deepspeed - cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" - sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" -} - - -setupData() { # dfl: abbrv. 
for DATA_FILE_LIST - dfl=$1 - printf "Calling: \`setupData()\` with %s\n" "${dfl}" - ndocs=$(wc -l < "${dfl}") - ws=$(sumWeights "${dfl}") - dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - dcp="${HERE}/.cache/${dfl_stem}/index-cache" - mkdir -p dcp - export DATA_FILE_LIST="${dfl}" - export NUM_DOCS="${ndocs}" - export WEIGHT_SUM="${ws}" - export DFL_STEM="${dfl_stem}" - export DATA_CACHE_PATH="${dcp}" - echo "--------------------" - echo "Updated environment:" - printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" - printf "NUM_DOCS: %s\n " "${NUM_DOCS}" - printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}" - printf "DFL_STEM: %s\n" "${DFL_STEM}" - printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" - echo "--------------------" -} - -printBlack() { - printf "\e[1;30m%s\e[0m\n" "$@" -} - -printRed() { - printf "\e[1;31m%s\e[0m\n" "$@" -} - -printGreen() { - printf "\e[1;32m%s\e[0m\n" "$@" -} - -printYellow() { - printf "\e[1;33m%s\e[0m\n" "$@" -} - -printBlue() { - printf "\e[1;34m%s\e[0m\n" "$@" -} - -printMagenta() { - printf "\e[1;35m%s\e[0m\n" "$@" -} - -printCyan() { - printf "\e[1;36m%s\e[0m\n" "$@" -} -printWhite() { - printf "\e[1;37m%s\e[0m\n" "$@" -} From 7e6e2637e4132834700aa935a43add65ce56be97 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 10:29:35 -0600 Subject: [PATCH 100/268] Add (new) `ALCF/README.md` --- ALCF/README.md | 134 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 ALCF/README.md diff --git a/ALCF/README.md b/ALCF/README.md new file mode 100644 index 0000000000..bc2314b914 --- /dev/null +++ b/ALCF/README.md @@ -0,0 +1,134 @@ +# Megatron-DeepSpeed @ ALCF + + +## Polaris + +### Install + +1. Clone [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) + + ```bash + [#](#.md) ---- 0. 
Clone + navigate into `Megatron-DeepSpeed`: + $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed + $ cd Megatron-DeepSpeed + ``` + +2. Create `conda` env: + + ```bash + # ---- 1. Create conda env ------------------------------------------------- + $ module load conda/2023-10-04 #; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 + $ MPICC="cc -shared -taret-accel=nvidia80" + $ DAY=$(date "+%Y-^m-%d") + $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" + $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" + # --------------------------------------------------------------------------- + ``` + +3. Install dependencies: + + ```bash + # -------- a. Install dependencies ------------------------------------------ + $ conda install -c pytorch -c nvidia --solver libmamba mpi4py pytorch-cuda=11.8 ninja torchvision torchaudio pytorch-cuda=11.8 transformers xformers triton + $ python3 -m pip install --upgrade pip pybind11 toolong appdirs wandb sentencepiece ipython setuptools wheel ninja + $ python3 -m pip install --upgrade deepspeed wandb + ``` + + 1. Install `apex`: + + ```bash + # ------------ i. Install `apex` -------------------------------------------- + $ git clone https://github.com/NVIDIA/apex + $ cd apex + $ module swap gcc gcc/10.3.0 + $ python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + ``` + + 2. Install `ezpz`: + + ```bash + # ------------ ii. 
Install `ezpz` -------------------------------------------- + $ git clone https://github.com/saforem2/ezpz + $ python3 -m pip install -e "ezpz[dev]" + # --------------------------------------------------------------------------- + ``` + +### Running + +- The (shell) script used to launch pre-training is: + - Polaris: + [`train_llama_alcf_polaris.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf_polaris.sh) + - Aurora: + [`train_llama_alcf_aurora.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf_aurora.sh) + +- These shell script(s) will set the appropriate environment variables, load the correct conda +modules and launch +[`pretrain_gpt_alcf.py`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/pretrain_gpt_alcf.py) using `deepspeed` + + +Explicitly, we can: + +```bash +# 1. Launch interactive job +$ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I +# 2. Load conda environment +$ module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 +$ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 +# 3. Navigate into `Megatron-DeepSpeed` directory +$ cd Megatron-DeepSpeed +# 4. Launch: +$ bash train_llama_alcf_polaris.sh +``` + +
[Output] + +```bash +source-ing /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ALCF/helpers_alcf.sh + +CommandNotFoundError: Your shell has not been properly configured to use 'conda deactivate'. +To initialize your shell, run + + $ conda init + +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + +See 'conda init --help' for more information and options. + +IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + +Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env +Found ezpz! +/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py +Has ezpz installed. Nothing to do. +┌────────────────────────────────────────────────────────────────── +│ [Hosts]: +│ • [host:0] - x3005c0s37b0n0.hsn.cm.polaris.alcf.anl.gov +│ • [host:1] - x3005c0s37b1n0.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [DIST INFO]: +│ • Loading job env from: /home/foremans/.pbsenv +│ • HOSTFILE: /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +│ • NHOSTS: 2 +│ • NGPU_PER_HOST: 4 +│ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 +│ • WORLD_SIZE: 8 +│ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Launch]: +│ • Use: 'launch' (=mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) +│ to launch job +└────────────────────────────────────────────────────────────────── + +# [...] +``` + +
From 7133eb63212ba6b2773313df8029e33532ba55be Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 10:29:47 -0600 Subject: [PATCH 101/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 290 ++++++++++-------------------------- 1 file changed, 81 insertions(+), 209 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index b990cc124d..e4b4ef4f43 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -5,7 +5,6 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home - function sourceFile() { fp="$1" echo "source-ing ${fp}" @@ -17,220 +16,94 @@ function sourceFile() { fi } -# +++++++++++++++ SCRIPT START ++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') -sourceFile "${HERE}/ALCF_utils/helpers_alcf.sh" || exit - -# ---- fns from ./helpers_alcf.sh ------------------- -setEnv || exit -saveDSenv || exit -ezpz || exit -makeHostfiles || exit -DFL="/eagle/datasets/dolma/data_file_list_reweighted.txt" -setupData "${DATA_FILE_LIST:-${DFL}}" || exit - -echo "Using $(which python3)" - -# mkdir -p "${DATA_CACHE_PATH}" -# if [[ -n "${DOLMA_CHUNK_IDX}" ]]; then -# echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." 
-# else -# echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -# fi -# echo "DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" -# ---------------------------------------------------- - -# ---- Parallelism Settings -------------------------- -PP=${PP:-1} -TP=${TP:-2} -export PP="${PP}" -export TP="${TP}" -export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" -export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} -# ---------------------------------------------------- - -# ---- Llama2 7B Config ------------------------------ -export MODEL_KEY="Llama-7B" -export HEADS=${HEADS:-32} -export NLAYERS=${NLAYERS:-32} -export HIDDEN=${HIDDEN:-4096} -export NUM_KV_HEAD=${NUM_KV_HEAD:-8} -export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} -# ---------------------------------------------------- - -# ---- Run Settings ---------------------------------- -export LR=${LR:-0.0003} -export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 -export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 -export ZERO_STAGE=${ZERO_STAGE:-2} -export MICRO_BATCH=${MICRO_BATCH:-8} -export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export EVAL_ITERS="${EVAL_ITERS:-10}" -# export TRAIN_ITER="${TRAIN_ITER:-320000}" -export TRAIN_ITER=${TRAIN_ITER:-317892} -export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" -export SAVE_INTERVAL=${SAVE_INTERVAL:-200} -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} -# export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} -export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) -export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" -export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" -export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -# ---------------------------------------------------- - -echo 
"++++++++++++++++++++++++++++++++++++++++++++++++++" -echo "- WORLD_SIZE:${WORLD_SIZE}" -echo "- NCCL: ${NCCL:-nccl}" -echo "- MODEL_TYPE: ${MODEL_TYPE}" -echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - -# +++++NOTES ++++++++++++++++++++++++++++++++++++++++++++++++++ -# XXX: -# - need to merge *.json files -# - Can we create indices on a per-dataset basis? -# (i.e. one for common-crawl, one for stack-code, etc.) -# - Aggregate `stack-code/**/{*.bin,*.idx}` -# -# - Given: {f1.bin,f2.bin,...,fn.bin} -# - tot_tokens = 0 -# - agg = [] -# - Start: -# - read: f1.bin -# - tot_tokens += sum(tokens(f1.bin)) -# - if tot_tokens < needed_tokens: -# - agg.append(f1.bin) -# - else: -# - -# TODO: -# - StackExchange ~ 500B total, using 80% ~ 400B tokens -# - figure out how to deal with MANY small files (e.g. stack-code) -# - Add logic for determining `train_iters` dynamically from `data-file-list` -# (which specifies a single _chunk_) -# - get script from Varuni -# - should: -# - take in a `data_file_list.txt` -# - return number of training iterations -# -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# ---- Build DeepSpeed Config --------------------------------- -export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" -bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 -# ------------------------------------------------------------- - -# ---- Specify output location -------------------------------- -export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" -# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} -OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" -export OUTPUT_DIR="${OUTPUT_DIR}" -export OUTPUT_LOG="${OUTPUT_DIR}/output.log" 
-export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" -echo "${OUTPUT_LOG}" >> "logs/latest" -mkdir -p "${OUTPUT_DIR}" -echo "!!!Please see logs at ${OUTPUT_DIR}" - -# ---- Setup DeepSpeed arguments -------------------------------- -ds_args=" " -ds_args=" --deepspeed ${ds_args}" -if [[ $PP == 1 ]]; then - ds_args=" --no-pipeline-parallel ${ds_args}" -fi -ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" -ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" - ds_args=" --deepspeed-activation-checkpointing ${ds_args}" - # --checkpoint-activations \ - # --deepspeed-activation-checkpointing -fi -# --------------------------------------------------------------- - -gpt_args=() - -# we are now using activation checkpoint provided by megatron, see below. -# ds_args=" --deepspeed-activation-checkpointing ${ds_args}" -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" - gpt_args+=( - "--checkpoint-activations" - "--checkpoint-num-layers 1" - ) -fi - -# take custom args +export HERE +# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. 
specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData || exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. print job info +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# Take custom args custom_args=" $@" -# launcher setting -hfds="${HERE}/hostfile_deepspeed" -hfmpi="${HERE}/hostfile_mpich" -[ -f "$hfds" ] || exit -[ -f "$hfmpi" ] || exit - -LAUNCHER=${LAUNCHER:-MPICH} -if [[ $LAUNCHER == "deepspeed" ]]; then - launcher="" -else - launcher="--force_multi --hostfile $hfds --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" -fi - -NCCL=${NCCL:-nccl} -EXEC="pretrain_gpt_alcf.py" +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit # --vocab-file $VOCAB_FILE \ # --merge-file $MERGE_FILE \ # --lr-decay-iters 320000 \ - # --lr-warmup-iters 5000 \ - # --lr-decay-iters 10000 \ - # --num-workers 4 \ - # launch python3 ${EXEC} \ - # --data-impl mmap \ -run_cmd=" - deepspeed $launcher ${EXEC} \ - --$DTYPE \ - --split 100,0,0 \ - --use-flash-attn-v2 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --lr ${LR} \ - --log-interval 1 \ - --num-workers 0 \ - --seq-length $SEQ \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --num-layers ${NLAYERS} \ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${NCCL} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - 
--data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - $ds_args \ - ${LLAMA_ARGS} \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} - " +# --lr-warmup-iters 5000 \ +# --lr-decay-iters 10000 \ +# --num-workers 4 \ +# launch python3 ${EXEC} \ +# --data-impl mmap \ +# source ./ezpz/src/ezpz/bin/getjobenv || exit + # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ + # ${DIST_LAUNCH} python3 ${EXEC} \ + # deepspeed $launcher ${EXEC} \ +# run_cmd=" +# deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ +# |& tee ${OUTPUT_LOG} +# " + # --$DTYPE \ + # --num-workers 0 \ + # --split 100,0,0 \ + # --log-interval 1 \ + # --use-flash-attn-v2 \ + # --no-bias-gelu-fusion \ + # --lr-decay-style cosine \ + # --no-bias-dropout-fusion \ + # --no-masked-softmax-fusion \ + # --tokenizer-type Llama2Tokenizer \ + # --no-gradient-accumulation-fusion \ + # --accumulate-allreduce-grads-in-fp32 \ + # --use-checkpoint-opt_param-scheduler \ + # --lr ${LR} \ + # --seq-length $SEQ \ + # --save ${CKPT_DIR} \ + # --load ${CKPT_DIR} \ + # --num-layers ${NLAYERS} \ + # --hidden-size ${HIDDEN} \ + # --train-iters ${TRAIN_ITER} \ + # --eval-iters ${EVAL_ITERS} \ + # --distributed-backend ${NCCL} \ + # --num-attention-heads ${HEADS} \ + # --save-interval ${SAVE_INTERVAL} \ + # --eval-interval ${EVAL_INTERVAL} \ + # --max-position-embeddings ${SEQ} \ + # --micro-batch-size ${MICRO_BATCH} \ + # --data-file-list ${DATA_FILE_LIST} \ + # --tensor-model-parallel-size ${TP} \ + # --global-batch-size ${GLOBAL_BATCH} \ + # --pipeline-model-parallel-size ${PP} \ + # --num-key-value-heads ${NUM_KV_HEAD} \ + # --data-cache-path ${DATA_CACHE_PATH} \ + # --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + # --tokenizer-model ${TOKENIZER_MODEL} 
\ + # $ds_args \ + # ${LLAMA_ARGS} \ + # ${gpt_args[*]} \ + # $custom_args \ + +run_cmd="deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} ${CLI_ARGS} |& tee ${OUTPUT_LOG}" + # >> ${OUTPUT_LOG} 2>&1 & # >> ${OUTPUT_LOG} 2>&1 & # |& tee $OUTPUT_DIR/output.log @@ -240,8 +113,7 @@ echo "All DeepSpeed(s): $(which -a deepspeed)" echo "Using $(which deepspeed)" ds_report -echo "${run_cmd}" - +echo "[RUNNING]: ${run_cmd}" printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" # echo "${OUTPUT_LOG}" From eb89ee4c1516b93d5f807bb699ae81c0bc3ac097 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 10:31:27 -0600 Subject: [PATCH 102/268] Update `ALCF/README.md` --- ALCF/README.md | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index bc2314b914..31bf4b759d 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,6 +1,5 @@ # Megatron-DeepSpeed @ ALCF - ## Polaris ### Install @@ -16,19 +15,16 @@ 2. Create `conda` env: ```bash - # ---- 1. Create conda env ------------------------------------------------- $ module load conda/2023-10-04 #; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 $ MPICC="cc -shared -taret-accel=nvidia80" $ DAY=$(date "+%Y-^m-%d") $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" - # --------------------------------------------------------------------------- ``` 3. Install dependencies: ```bash - # -------- a. 
Install dependencies ------------------------------------------ $ conda install -c pytorch -c nvidia --solver libmamba mpi4py pytorch-cuda=11.8 ninja torchvision torchaudio pytorch-cuda=11.8 transformers xformers triton $ python3 -m pip install --upgrade pip pybind11 toolong appdirs wandb sentencepiece ipython setuptools wheel ninja $ python3 -m pip install --upgrade deepspeed wandb @@ -37,7 +33,6 @@ 1. Install `apex`: ```bash - # ------------ i. Install `apex` -------------------------------------------- $ git clone https://github.com/NVIDIA/apex $ cd apex $ module swap gcc gcc/10.3.0 @@ -47,10 +42,8 @@ 2. Install `ezpz`: ```bash - # ------------ ii. Install `ezpz` -------------------------------------------- $ git clone https://github.com/saforem2/ezpz $ python3 -m pip install -e "ezpz[dev]" - # --------------------------------------------------------------------------- ``` ### Running @@ -66,19 +59,19 @@ modules and launch [`pretrain_gpt_alcf.py`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/pretrain_gpt_alcf.py) using `deepspeed` -Explicitly, we can: +- Explicitly, to launch: -```bash -# 1. Launch interactive job -$ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I -# 2. Load conda environment -$ module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 -$ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 -# 3. Navigate into `Megatron-DeepSpeed` directory -$ cd Megatron-DeepSpeed -# 4. Launch: -$ bash train_llama_alcf_polaris.sh -``` + ```bash + # 1. Launch interactive job + $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I + # 2. Load conda environment + $ module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 + $ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 + # 3. 
Navigate into `Megatron-DeepSpeed` directory + $ cd Megatron-DeepSpeed + # 4. Launch: + $ bash train_llama_alcf_polaris.sh + ```
[Output] From 4076cf09fb92d6f61b637b2b51234f11e5280b19 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 10:36:48 -0600 Subject: [PATCH 103/268] Update `ALCF/README.md` --- ALCF/README.md | 112 ++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 31bf4b759d..47920644db 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -7,7 +7,6 @@ 1. Clone [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) ```bash - [#](#.md) ---- 0. Clone + navigate into `Megatron-DeepSpeed`: $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed $ cd Megatron-DeepSpeed ``` @@ -15,9 +14,9 @@ 2. Create `conda` env: ```bash - $ module load conda/2023-10-04 #; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - $ MPICC="cc -shared -taret-accel=nvidia80" - $ DAY=$(date "+%Y-^m-%d") + $ module load conda/2023-10-04 + $ export MPICC="cc -shared -taret-accel=nvidia80" + $ export DAY=$(date "+%Y-^m-%d") $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" ``` @@ -30,7 +29,7 @@ $ python3 -m pip install --upgrade deepspeed wandb ``` - 1. Install `apex`: + - [`NVIDIA/apex`](https://github.com/NVIDIA/apex): ```bash $ git clone https://github.com/NVIDIA/apex @@ -39,7 +38,7 @@ $ python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ ``` - 2. Install `ezpz`: + - [`ezpz`](https://github.com/saforem2/ezpz): ```bash $ git clone https://github.com/saforem2/ezpz @@ -72,56 +71,53 @@ modules and launch # 4. Launch: $ bash train_llama_alcf_polaris.sh ``` +
[Output] -
[Output] - -```bash -source-ing /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ALCF/helpers_alcf.sh - -CommandNotFoundError: Your shell has not been properly configured to use 'conda deactivate'. -To initialize your shell, run - - $ conda init - -Currently supported shells are: - - bash - - fish - - tcsh - - xonsh - - zsh - - powershell - -See 'conda init --help' for more information and options. - -IMPORTANT: You may need to close and restart your shell after running 'conda init'. - - -Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env -Found ezpz! -/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py -Has ezpz installed. Nothing to do. -┌────────────────────────────────────────────────────────────────── -│ [Hosts]: -│ • [host:0] - x3005c0s37b0n0.hsn.cm.polaris.alcf.anl.gov -│ • [host:1] - x3005c0s37b1n0.hsn.cm.polaris.alcf.anl.gov -└────────────────────────────────────────────────────────────────── -┌────────────────────────────────────────────────────────────────── -│ [DIST INFO]: -│ • Loading job env from: /home/foremans/.pbsenv -│ • HOSTFILE: /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov -│ • NHOSTS: 2 -│ • NGPU_PER_HOST: 4 -│ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 -│ • WORLD_SIZE: 8 -│ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov -└────────────────────────────────────────────────────────────────── -┌────────────────────────────────────────────────────────────────── -│ [Launch]: -│ • Use: 'launch' (=mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) -│ to launch job -└────────────────────────────────────────────────────────────────── - -# [...] -``` - -
+ ```bash + source-ing /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ALCF/helpers_alcf.sh + + CommandNotFoundError: Your shell has not been properly configured to use 'conda deactivate'. + To initialize your shell, run + + $ conda init + + Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + + See 'conda init --help' for more information and options. + + IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + + Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env + Found ezpz! + /lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py + Has ezpz installed. Nothing to do. + ┌────────────────────────────────────────────────────────────────── + │ [Hosts]: + │ • [host:0] - x3005c0s37b0n0.hsn.cm.polaris.alcf.anl.gov + │ • [host:1] - x3005c0s37b1n0.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [DIST INFO]: + │ • Loading job env from: /home/foremans/.pbsenv + │ • HOSTFILE: /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + │ • NHOSTS: 2 + │ • NGPU_PER_HOST: 4 + │ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 + │ • WORLD_SIZE: 8 + │ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov + └────────────────────────────────────────────────────────────────── + ┌────────────────────────────────────────────────────────────────── + │ [Launch]: + │ • Use: 'launch' (=mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1777928.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) + │ to launch job + └────────────────────────────────────────────────────────────────── + # [...] + ``` +
From 7e8a1a8ebeacf0e185dd2e53ee99349aae35c6cf Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 17:07:30 -0600 Subject: [PATCH 104/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 115 ++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index e4b4ef4f43..e657334d23 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -35,7 +35,7 @@ setParams || exit # 5. set command line arguments to pass to ` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments -setData || exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` printJobInfo || exit # 11. 
print job info # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -46,64 +46,64 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit -# --vocab-file $VOCAB_FILE \ -# --merge-file $MERGE_FILE \ -# --lr-decay-iters 320000 \ -# --lr-warmup-iters 5000 \ -# --lr-decay-iters 10000 \ -# --num-workers 4 \ -# launch python3 ${EXEC} \ -# --data-impl mmap \ -# source ./ezpz/src/ezpz/bin/getjobenv || exit +run_cmd=" + deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + --$DTYPE \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --use-flash-attn-v2 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --lr ${LR} \ + --seq-length $SEQ \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${NCCL} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + + # --------------------------------------------------- + # --vocab-file $VOCAB_FILE \ + # --merge-file $MERGE_FILE \ + # --lr-decay-iters 320000 \ + # --lr-warmup-iters 5000 \ + # --lr-decay-iters 10000 \ + # 
--num-workers 4 \ + # launch python3 ${EXEC} \ + # --data-impl mmap \ + # source ./ezpz/src/ezpz/bin/getjobenv || exit + # --------------------------------------------------- # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ # ${DIST_LAUNCH} python3 ${EXEC} \ # deepspeed $launcher ${EXEC} \ -# run_cmd=" -# deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ -# |& tee ${OUTPUT_LOG} -# " - # --$DTYPE \ - # --num-workers 0 \ - # --split 100,0,0 \ - # --log-interval 1 \ - # --use-flash-attn-v2 \ - # --no-bias-gelu-fusion \ - # --lr-decay-style cosine \ - # --no-bias-dropout-fusion \ - # --no-masked-softmax-fusion \ - # --tokenizer-type Llama2Tokenizer \ - # --no-gradient-accumulation-fusion \ - # --accumulate-allreduce-grads-in-fp32 \ - # --use-checkpoint-opt_param-scheduler \ - # --lr ${LR} \ - # --seq-length $SEQ \ - # --save ${CKPT_DIR} \ - # --load ${CKPT_DIR} \ - # --num-layers ${NLAYERS} \ - # --hidden-size ${HIDDEN} \ - # --train-iters ${TRAIN_ITER} \ - # --eval-iters ${EVAL_ITERS} \ - # --distributed-backend ${NCCL} \ - # --num-attention-heads ${HEADS} \ - # --save-interval ${SAVE_INTERVAL} \ - # --eval-interval ${EVAL_INTERVAL} \ - # --max-position-embeddings ${SEQ} \ - # --micro-batch-size ${MICRO_BATCH} \ - # --data-file-list ${DATA_FILE_LIST} \ - # --tensor-model-parallel-size ${TP} \ - # --global-batch-size ${GLOBAL_BATCH} \ - # --pipeline-model-parallel-size ${PP} \ - # --num-key-value-heads ${NUM_KV_HEAD} \ - # --data-cache-path ${DATA_CACHE_PATH} \ - # --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - # --tokenizer-model ${TOKENIZER_MODEL} \ - # $ds_args \ - # ${LLAMA_ARGS} \ - # ${gpt_args[*]} \ - # $custom_args \ - -run_cmd="deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} ${CLI_ARGS} |& tee ${OUTPUT_LOG}" - # >> ${OUTPUT_LOG} 2>&1 & # >> ${OUTPUT_LOG} 2>&1 & # |& tee $OUTPUT_DIR/output.log @@ -113,7 +113,8 @@ echo "All DeepSpeed(s): $(which -a deepspeed)" echo "Using $(which deepspeed)" ds_report -echo "[RUNNING]: ${run_cmd}" +echo 
"${run_cmd}" + printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" # echo "${OUTPUT_LOG}" From 13adb2e1e8b7b3626580dc39b2c96c43bd6460bb Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 17:08:00 -0600 Subject: [PATCH 105/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 140 ++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 89 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 6ff3447f05..ccb7c345ec 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -1,49 +1,5 @@ #!/bin/bash --login -buildCLIargs() { - custom_args=" $@" - export CLI_ARGS=" - --$DTYPE \ - --num-workers 0 \ - --split 100,0,0 \ - --log-interval 1 \ - --use-flash-attn-v2 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --lr ${LR} \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --seq-length ${SEQ} \ - --num-layers ${NLAYERS} \ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${NCCL} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - $ds_args \ - ${LLAMA_ARGS} \ - ${gpt_args[*]} \ - ${custom_args} \ - " -} printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" @@ -183,7 +139,6 @@ saveDSenv() { setOutput() { # ---- Specify output 
location -------------------------------- export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" - # OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" @@ -201,20 +156,6 @@ buildDSconfig() { } -# makeDSenv() { -# saveDSenv -# } - - -# makeDSenv() { -# echo "PATH=${PATH}" > .deepspeed_env -# echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> .deepspeed_env -# echo "http_proxy=${http_proxy}" >> .deepspeed_env -# echo "https_proxy=${https_proxy}" >> .deepspeed_env -# echo "CFLAGS=${CFLAGS}" >> .deepspeed_env -# echo "PYTHONUSERBASE=$PYTHONUSERBASE" >> .deepspeed_env -# } - sumWeights() { local file_list=$1 weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") @@ -231,42 +172,19 @@ sumFiles() { done } -# setupData() { -# cidx=$1 -# echo "Caught DOLMA_CHUNK_IDX: ${cidx} !!" -# dfl="./chunks-reweighted/10/data_file_list_chunk_${cidx}_of_10.txt" -# if [[ -z "${DATA_FILE_LIST}" ]]; then -# DATA_FILE_LIST="${dfl}" -# else -# echo "Caught DATA_FILE_LIST: ${DATA_FILE_LIST} from ENV!!" 
-# fi -# NDOCS=$(wc -l < "${DATA_FILE_LIST}") && export NDOCS="${NDOCS}" -# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" -# export WEIGHT_SUM="${WEIGHT_SUM}" -# export NDOCS="${NDOCS}" -# echo "Using DATA_FILE_LIST: ${DATA_FILE_LIST} with ${NDOCS} documents" -# echo "WEIGHT SUM: ${WEIGHT_SUM}" -# data_file_list_stem=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") -# export DOLMA_CHUNK_IDX="${cidx}" -# export DATA_FILE_LIST_STEM="${data_file_list_stem}" -# export DATA_CACHE_PATH=".cache/${data_file_list_stem}/index-cache" -# mkdir -p "${DATA_CACHE_PATH}" -# } -# - setEnv() { - if [[ $(hostname) == x4* ]]; then + if [[ $(hostname) == x4* ]]; then # ---- [Aurora] ---------------------- SETENV_FILE="${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" if [[ "${SETENV_FILE}" ]]; then # shellcheck source=/home/foremans/anl_24_release_q4/llm.devkit/setenv.sh - source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" + source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" || exit else echo "Unable to source ${SETENV_FILE}, exiting!" exit fi - elif [[ $(hostname) == x3* ]]; then - # ---- load conda ----------------------------------- + elif [[ $(hostname) == x3* ]]; then # ---- [Polaris] --------------------- + # ---- [load conda] --------------------- module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 export PYTHONUSERBASE="${HOME}/.local/polaris/conda/2024-03-06" mkdir -p "${PYTHONUSERBASE}" @@ -276,7 +194,7 @@ setEnv() { # echo "Not using VIRTUAL_ENV" # # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit # fi - else + else # ------------------------------------- [Unknown] ------------------- echo "Unknown hostname $(hostname)" exit 1 fi @@ -294,8 +212,7 @@ makeHostfiles() { sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" } - -setData() { # dfl: abbrv. for DATA_FILE_LIST +setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------------- dfl="${1:-/eagle/datasets/dolma/data_file_list_reweighted.txt}" # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" printf "Calling: \`setData()\` with %s\n" "${dfl}" @@ -319,6 +236,51 @@ setData() { # dfl: abbrv. for DATA_FILE_LIST echo "--------------------" } +buildCLIargs() { # ---- [BROKEN] ------------------------------------------- + custom_args=" $@" + export CLI_ARGS=" + --$DTYPE \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --use-flash-attn-v2 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${NCCL} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + $ds_args \ + ${LLAMA_ARGS} \ + ${gpt_args[*]} \ + ${custom_args} \ + " +} + printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" } From 509e8ec58771c32171447193883224411571df02 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 17:13:27 -0600 Subject: [PATCH 106/268] Update `ALCF/README.md` --- ALCF/README.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git 
a/ALCF/README.md b/ALCF/README.md index 47920644db..4b67b59ee2 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -2,6 +2,70 @@ ## Polaris +- Unable to save checkpoints with `torch==2.1` + `cuda==11.8`: + - **NEED TO DEBUG / FIX !!** + - Training progresses OK: + + ```bash + [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | + [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] + [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB + iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | + saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 + # ... 
+ ``` + + - Then crashes with: + + ```python + Traceback (most recent call last): + Traceback (most recent call last): + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in + model = main() + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main + model = pretrain( + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain + iteration = train(forward_step_func, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train + save_checkpoint_and_time(iteration, model, optimizer, + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time + save_checkpoint(iteration, model, optimizer, opt_param_scheduler) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint + state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info + info.update(model[0].universal_checkpoint_info()) + File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info + info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() + File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' + ``` + + 🤔 + + +- Continue runs on Polaris @ + - [x] 48 Nodes + - [x] 32 Nodes + - [x] 16 Nodes + - [x] 8 Nodes + - [x] 4 Nodes + 
+- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` + - 😔, failed. + +- Convergence: + - [ ] Use `bf16` on both systems + - [ ] Will need to track (for each layer): + - [ ] inputs / outputs + - [ ] weights, gradients + - [ ] Start thread in Intel SC23 channel to discuss convergence issues + - [ ] Add hooks to track additional data + +- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` + - [ ] specifically, `momentum, beta{1, 2}, etc` + ### Install 1. Clone [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) @@ -16,7 +80,7 @@ ```bash $ module load conda/2023-10-04 $ export MPICC="cc -shared -taret-accel=nvidia80" - $ export DAY=$(date "+%Y-^m-%d") + $ export DAY=$(date "+%Y-%m-%d") $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" ``` @@ -65,7 +129,7 @@ modules and launch $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I # 2. Load conda environment $ module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - $ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 + # $ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 # 3. Navigate into `Megatron-DeepSpeed` directory $ cd Megatron-DeepSpeed # 4. Launch: From c272b4d24d8008c79d316f21500fa3a3fab7bdb9 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 17:18:51 -0600 Subject: [PATCH 107/268] Update `ALCF/README.md` --- ALCF/README.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 4b67b59ee2..76bb90ba5c 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -2,6 +2,8 @@ ## Polaris +
📅 2024-03-07 + - Unable to save checkpoints with `torch==2.1` + `cuda==11.8`: - **NEED TO DEBUG / FIX !!** - Training progresses OK: @@ -44,6 +46,19 @@ 🤔 +- Convergence: + - [ ] Use `bf16` on both systems + - [ ] Will need to track (for each layer): + - [ ] inputs / outputs + - [ ] weights, gradients + - [ ] Start thread in Intel SC23 channel to discuss convergence issues + - [ ] Add hooks to track additional data + +- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` + - [ ] specifically, `momentum, beta{1, 2}, etc` + + +
Completed - Continue runs on Polaris @ - [x] 48 Nodes @@ -55,16 +70,9 @@ - [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` - 😔, failed. -- Convergence: - - [ ] Use `bf16` on both systems - - [ ] Will need to track (for each layer): - - [ ] inputs / outputs - - [ ] weights, gradients - - [ ] Start thread in Intel SC23 channel to discuss convergence issues - - [ ] Add hooks to track additional data +
-- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - - [ ] specifically, `momentum, beta{1, 2}, etc` +
### Install From d9fc6e2bc4e607a84e633ea69c3ec8fa3e3401c9 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 7 Mar 2024 18:08:35 -0600 Subject: [PATCH 108/268] Update `ALCF/README.md` --- ALCF/README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 76bb90ba5c..9a58b8b056 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -4,7 +4,7 @@
📅 2024-03-07 -- Unable to save checkpoints with `torch==2.1` + `cuda==11.8`: +- ‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`: - **NEED TO DEBUG / FIX !!** - Training progresses OK: @@ -89,14 +89,20 @@ $ module load conda/2023-10-04 $ export MPICC="cc -shared -taret-accel=nvidia80" $ export DAY=$(date "+%Y-%m-%d") - $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" + $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.10" ``` + > [!NOTE] + > In the `conda create` command above, + > you can replace `--name "${DAY}"` with + > `--prefix /path/to/your/conda/envs`, if you prefer: + 3. Install dependencies: ```bash - $ conda install -c pytorch -c nvidia --solver libmamba mpi4py pytorch-cuda=11.8 ninja torchvision torchaudio pytorch-cuda=11.8 transformers xformers triton + $ conda activate "${DAY}" # e.g. 2024-03-07 + $ conda install -c pytorch -c nvidia --solver libmamba mpi4py ninja transformers xformers triton pytorch torchvision torchaudio pytorch-cuda=11.8 $ python3 -m pip install --upgrade pip pybind11 toolong appdirs wandb sentencepiece ipython setuptools wheel ninja $ python3 -m pip install --upgrade deepspeed wandb ``` @@ -106,6 +112,7 @@ ```bash $ git clone https://github.com/NVIDIA/apex $ cd apex + # NOTE: need GCC < 11 for APEX ¯\_(ツ)_/¯ ?? 
$ module swap gcc gcc/10.3.0 $ python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ ``` From e21451e1d7be832dae13ff869e761ece17ec2638 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 8 Mar 2024 15:34:53 -0600 Subject: [PATCH 109/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index e657334d23..8b3edf41f2 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -48,11 +48,11 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit run_cmd=" deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + --use-flash-attn-v2 \ --$DTYPE \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ - --use-flash-attn-v2 \ --no-bias-gelu-fusion \ --lr-decay-style cosine \ --no-bias-dropout-fusion \ From e3327936be61dd55a953696754b2db27ac24f315 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 8 Mar 2024 15:35:10 -0600 Subject: [PATCH 110/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index ccb7c345ec..1e570ee43b 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -185,9 +185,11 @@ setEnv() { fi elif [[ $(hostname) == x3* ]]; then # ---- [Polaris] --------------------- # ---- [load conda] --------------------- - module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - export PYTHONUSERBASE="${HOME}/.local/polaris/conda/2024-03-06" - mkdir -p "${PYTHONUSERBASE}" + module load conda/2023-10-04; conda activate cu118-pt221 + # module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/py311-cu118 + # ; conda activate 
/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 + # export PYTHONUSERBASE="${HOME}/.local/polaris/conda/py311-cu118" + # mkdir -p "${PYTHONUSERBASE}" # if [[ "${VIRTUAL_ENV}" ]]; then # echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" # else From a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 8 Mar 2024 15:36:18 -0600 Subject: [PATCH 111/268] Fix checkpointing issue with `torch=2.2.1` in `megatron/model/gpt_model.py` --- megatron/model/gpt_model.py | 137 ++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 36 deletions(-) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 4545131217..4e1892e887 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -62,6 +62,21 @@ def post_language_model_processing(lm_output, labels, logit_weights, return loss +def CrossEntropy(output, labels): + labels, loss_mask = labels[0], labels[1] + + args = get_args() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + losses = tensor_parallel.vocab_parallel_cross_entropy(output.contiguous().float(), labels) + # [s b] => [b, s] + losses = losses.transpose(0, 1).contiguous() + loss_mask = loss_mask.view(-1) + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + return loss + + class GPTModel(MegatronModule): """GPT-2 Language model.""" @@ -190,6 +205,62 @@ def _get_vocab_param_patterns(self): ] return patterns + @staticmethod + def _get_vocab_param_patterns(): + args = get_args() + if args.untie_embeddings_and_output_weights: + patterns = [ + r"\d+.word_embeddings.weight", + r"\d+.lm_head.weight" + ] + else: + patterns = [ + r"tied_modules.embed.word_embeddings.weight" + ] + return patterns + + @staticmethod + def _get_tp_replicated_param_patterns(): + args = get_args() + patterns = [ + r"\d+.input_layernorm.weight", + r"\d+.post_attention_layernorm.weight", + r"\d+.weight", + ] + if 
args.add_position_embedding: + patterns.append(r"tied_modules.embed.position_embeddings.weight") + if args.add_bias_linear: + patterns.extend([ + r"\d+.self_attention.dense.bias", + r"\d+.mlp.dense_4h_to_h.bias", + ]) + if args.normalization == 'layernorm': + patterns.extend([ + r"\d+.input_layernorm.bias", + r"\d+.post_attention_layernorm.bias", + r"\d+.bias", + ]) + return patterns + + @staticmethod + def _get_row_parallel_param_patterns(): + return [ + r"\d+.mlp.dense_4h_to_h.weight", + r"\d+.self_attention.dense.weight", + ] + + @staticmethod + def _get_swiglu_col_parallel_param_patterns(): + args = get_args() + if not args.swiglu: + return [] + patterns = [ + r"\d+.mlp.dense_h_to_4h.weight", + ] + if args.add_bias_linear: + patterns.append(r"\d+.mlp.dense_h_to_4h.bias") + return patterns + def universal_checkpoint_info(self): info = dict() args = get_args() @@ -197,7 +268,6 @@ def universal_checkpoint_info(self): if DS_UNIVERSAL_CHECKPOINT_INFO: # Vocabulary parameters (embeddings) that require special handling due to padding. info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns() - if args.tensor_model_parallel_size > 1: # Parameter slices that should be averaged not concatenated. 
info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() @@ -206,23 +276,9 @@ def universal_checkpoint_info(self): info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns() return info - -def CrossEntropy(output, labels): - labels, loss_mask = labels[0], labels[1] - - args = get_args() - - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - losses = tensor_parallel.vocab_parallel_cross_entropy(output.contiguous().float(), labels) - # [s b] => [b, s] - losses = losses.transpose(0, 1).contiguous() - loss_mask = loss_mask.view(-1) - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss -class GPTModelPipe(PipelineModule,MegatronModule): +class GPTModelPipe(PipelineModule, MegatronModule): """GPT-2 Language model.""" def __init__(self, @@ -236,8 +292,9 @@ def __init__(self, config.init_method = init_method_normal(config.init_method_std) if config.output_layer_init_method is None: - config.output_layer_init_method = scaled_init_method_normal(config.init_method_std, - config.num_layers) + config.output_layer_init_method = scaled_init_method_normal( + config.init_method_std, + config.num_layers) self.specs = [] @@ -253,25 +310,33 @@ def _to_float16(inputs): # Embedding layer if args.untie_embeddings_and_output_weights: - self.specs.append(LayerSpec(EmbeddingPipe, - args.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - num_tokentypes=num_tokentypes, - embedding_weights_in_fp32=args.embedding_weights_in_fp32,)) + self.specs.append( + LayerSpec( + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + num_tokentypes=num_tokentypes, + embedding_weights_in_fp32=args.embedding_weights_in_fp32, + ) + ) else: - self.specs.append(TiedLayerSpec('embed', - EmbeddingPipe, - args.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - 
args.hidden_dropout, - config, - num_tokentypes=num_tokentypes, - embedding_weights_in_fp32=args.embedding_weights_in_fp32, - tied_weight_attr='word_embeddings_weight')) + self.specs.append( + TiedLayerSpec( + 'embed', + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + num_tokentypes=num_tokentypes, + embedding_weights_in_fp32=args.embedding_weights_in_fp32, + tied_weight_attr='word_embeddings_weight' + ) + ) for layer_idx in range(args.num_layers): self.specs.append( From fb7eaf7dccab338cdde9b34e9e6c9750755a7b67 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 9 Mar 2024 09:56:39 -0600 Subject: [PATCH 112/268] Update README.md --- ALCF/README.md | 57 +++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 9a58b8b056..bddab06a41 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -2,10 +2,36 @@ ## Polaris -
📅 2024-03-07 +
TODOs -- ‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`: - - **NEED TO DEBUG / FIX !!** +- Convergence: + - [ ] Use `bf16` on both systems + - [ ] Will need to track (for each layer): + - [ ] inputs / outputs + - [ ] weights, gradients + - [ ] Start thread in Intel SC23 channel to discuss convergence issues + - [ ] Add hooks to track additional data + +- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` + - [ ] specifically, `momentum, beta{1, 2}, etc` + +
Completed + +- Continue runs on Polaris @ + - [x] 48 Nodes + - [x] 32 Nodes + - [x] 16 Nodes + - [x] 8 Nodes + - [x] 4 Nodes + +- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` + - 😔, failed. + +- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~: + - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5) + +
🐛 Bug + - Training progresses OK: ```bash @@ -45,30 +71,9 @@ ``` 🤔 +
-- Convergence: - - [ ] Use `bf16` on both systems - - [ ] Will need to track (for each layer): - - [ ] inputs / outputs - - [ ] weights, gradients - - [ ] Start thread in Intel SC23 channel to discuss convergence issues - - [ ] Add hooks to track additional data - -- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - - [ ] specifically, `momentum, beta{1, 2}, etc` - - -
Completed - -- Continue runs on Polaris @ - - [x] 48 Nodes - - [x] 32 Nodes - - [x] 16 Nodes - - [x] 8 Nodes - - [x] 4 Nodes - -- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` - - 😔, failed. +
From 99d12f4c62b507b5e5f0d05cadcf70c82333b28d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Tue, 12 Mar 2024 00:54:28 -0500 Subject: [PATCH 113/268] added support for multiprocessing_context --- megatron/arguments.py | 1 + megatron/data/data_samplers.py | 2 +- train_llama_alcf_polaris_hzheng.sh | 22 +++++++++------------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index faeac611bc..a83c369018 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1307,6 +1307,7 @@ def _add_data_args(parser): help='Force to use certain index file.') group.add_argument('--repeated-dataloader', action='store_true', help='Once all the data has been loaded, reuse the DataLoader.') + group.add_argument('--multiprocessing-context', type=str, default='fork') return parser diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 8eb2f2a668..0aae13abce 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -46,7 +46,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True, - # multiprocessing_context='spawn' + multiprocessing_context=args.multiprocessing_context ) if args.repeated_dataloader: loader=RepeatingLoader(loader) diff --git a/train_llama_alcf_polaris_hzheng.sh b/train_llama_alcf_polaris_hzheng.sh index 0ca7cb78bb..1fb5345eda 100755 --- a/train_llama_alcf_polaris_hzheng.sh +++ b/train_llama_alcf_polaris_hzheng.sh @@ -4,25 +4,21 @@ #PBS -q debug-scaling #PBS -l select=2 #PBS -l filesystems=eagle:grand:home -export PPN=4 -export MD=/home/hzheng/ALCF-Megatron-DeepSpeed -module load conda/2023-10-04 -#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 -conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 cd ${PBS_O_WORKDIR} +export PPN=4 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh export 
PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) export TP=1 export PP=1 export MBS=1 export BS=$((MBS*PBS_JOBSIZE*PPN/PP/TP)) export SP=$((PBS_JOBSIZE*PPN/PP/TP)) -#export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/" - -export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") -export DATA_PATH="/eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple/" -#export DATA_FILE_LIST="/eagle/datasets//dolma//data_file_list_select.txt" -DATA_FILE_LIST=$PWD/test.txt -echo "BS: $BS\n PP:$PP \n TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" +echo "BS: $BS - PP:$PP - TP: $TP, PBS_JOBSIZE: $PBS_JOBSIZE" +# First time running, it will compile the fused kernels, which will take about 10 mins +# >>> done with compiling and loading fused kernels. Compilation time: 545.468 seconds HIDDEN_SIZE=4096 NUM_LAYERS=32 @@ -32,7 +28,7 @@ TRAIN_ITERS=10 ZERO_STAGE=2 MODEL=LLAMA_7B OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ +MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ From d91b237ba1460556f96c04e22d766b741d190e7d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Tue, 12 Mar 2024 01:12:14 -0500 Subject: [PATCH 114/268] changed script to common environement --- ALCF/test_blend_full.sh | 14 +++++++------- ALCF/test_blendable_dataset.py | 26 +++++++++++++++----------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/ALCF/test_blend_full.sh b/ALCF/test_blend_full.sh index 
4245304456..0218383980 100755 --- a/ALCF/test_blend_full.sh +++ b/ALCF/test_blend_full.sh @@ -6,16 +6,16 @@ #PBS -l filesystems=eagle:grand:home cd ${PBS_O_WORKDIR} export PPN=4 -export MD=/home/hzheng/ALCF-Megatron-DeepSpeed -module load conda/2023-10-04 -#conda activate /soft/datascience/megatron-deepspeed/2023-10-04 -conda activate $HOME/PolarisAT/pyenvs/megatron/2023-10-04 +export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed +source /eagle/argonne_tpc/soft/conda.sh +export TRITON_CACHE_DIR=/tmp/.cache/ + export TP=1 export PP=1 export SP=128 export MBS=1 export BS=$((MBS*SP)) -export export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") +export DATE_TAG=$(date +"%Y-%m-%d-%H-%M-%S") export DATA_FILE_LIST="/eagle/datasets//dolma/data_file_list_reweighted.txt" HIDDEN_SIZE=4096 @@ -25,8 +25,9 @@ EMBEDDINGS=2048 TRAIN_ITERS=80797 ZERO_STAGE=2 MODEL=LLAMA_7B +export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -python3 ALCF_utils/test_blendable_dataset.py \ +mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN)) --ppn $PPN --cpu-bind depth -d 16 python3 ALCF/test_blendable_dataset.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -51,7 +52,6 @@ python3 ALCF_utils/test_blendable_dataset.py \ --lr-warmup-iters 2 \ --optimizer adam \ --adam-beta1 0.9 \ - --mmap_warmup False \ --adam-beta2 0.95 \ --log-interval 1 \ --cpu-optimizer \ diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py index a0cccbb6cb..503f499fab 100644 --- a/ALCF/test_blendable_dataset.py +++ b/ALCF/test_blendable_dataset.py @@ -5,12 +5,14 @@ from megatron.arguments import parse_args from megatron.initialize import initialize_megatron from megatron.data.data_samplers import build_pretraining_data_loader - +from mpi4py import MPI +comm = MPI.COMM_WORLD initialize_megatron(allow_no_cuda=True) args = get_args() 
data_file_list = args.data_file_list -print(f"Reading data from {args.data_file_list}") +if comm.rank==0: + print(f"Reading data from {args.data_file_list}") files = [] weights = [] flist = [] @@ -28,10 +30,10 @@ num_samples = args.global_batch_size*args.train_iters num_datasets = len(weights) - -print(f"Number of datasets: {num_datasets}") -print(f"Global batch size: {args.global_batch_size}") -print(f"Training iterations: {args.train_iters}") +if comm.rank==0: + print(f"Number of datasets: {num_datasets}") + print(f"Global batch size: {args.global_batch_size}") + print(f"Training iterations: {args.train_iters}") train_valid_test_num_samples = [num_samples, 0, 0] seed=args.seed data_impl = args.data_impl @@ -48,21 +50,23 @@ ratio_select=np.zeros(num_datasets) #for i in range(num_datasets): # ratio_select[i] = np.sum([i==d for d in dataset_idx])/num_samples - -print(f"Total number of samples: {len(train_ds)}") -print(f"Weights set: {weights[:min(8, num_datasets)]}") +if comm.rank ==0: + print(f"Total number of samples: {len(train_ds)}") + print(f"Weights set: {weights[:min(8, num_datasets)]}") #print(f"Weights across training: {ratio_select[:min(8, num_datasets)]}") for e in range(min(100, args.train_iters)): ratio_select=np.zeros(num_datasets) for i in range(num_datasets): ratio_select[i] = np.sum([i==d for d in dataset_idx[e*args.global_batch_size:(e+1)*args.global_batch_size]])/args.global_batch_size - print(f"iter-{e}: {ratio_select[:min(8, num_datasets)]}") + if comm.rank==0: + print(f"iter-{e}: {ratio_select[:min(8, num_datasets)]}") print("First 10 samples") for i in range(10): - print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") + if comm.rank==0: + print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") #### Build data loaders From 7df1664b6c8c61a4a786354b5c2a0ed26113eca7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 12 Mar 2024 
12:17:41 -0500 Subject: [PATCH 115/268] Update README.md --- ALCF/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index bddab06a41..252c2289d5 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -148,11 +148,11 @@ modules and launch # 1. Launch interactive job $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I # 2. Load conda environment - $ module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - # $ export PYTHONUSERBASE=/home/foremans/.local/polaris/conda/2024-03-06 + $ module load conda/2023-10-04 ; conda activate /eagle/datascience/foremans/miniconda3/envs/cu118-pt221 ; unset PYTHONUSERBASE # 3. Navigate into `Megatron-DeepSpeed` directory $ cd Megatron-DeepSpeed # 4. Launch: + $ export PBS_O_WORKDIR=$(pwd) $ bash train_llama_alcf_polaris.sh ```
[Output] From 9a954444fdd9dd5dbfb185af44061f95417a0851 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 12 Mar 2024 20:15:51 -0500 Subject: [PATCH 116/268] Pull in changes from `microsoft/Megatron-DeepSpeed@df0e2e4` --- megatron/model/gpt_model.py | 349 +++++++++++++++--------------------- 1 file changed, 144 insertions(+), 205 deletions(-) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 4e1892e887..0527765f16 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -62,19 +62,117 @@ def post_language_model_processing(lm_output, labels, logit_weights, return loss -def CrossEntropy(output, labels): - labels, loss_mask = labels[0], labels[1] +class UniversalCheckpointInfo: + def __init__(self, using_model_pipe: bool): + self.using_model_pipe = using_model_pipe + self.args = get_args() + self.info = self._build_universal_checkpoint_info() - args = get_args() + def get(self): + return self.info - # [b s] => [s b] - labels = labels.transpose(0, 1).contiguous() - losses = tensor_parallel.vocab_parallel_cross_entropy(output.contiguous().float(), labels) - # [s b] => [b, s] - losses = losses.transpose(0, 1).contiguous() - loss_mask = loss_mask.view(-1) - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return loss + def _build_universal_checkpoint_info(self): + info = dict() + if DS_UNIVERSAL_CHECKPOINT_INFO: + # Vocabulary parameters (embeddings) that require special handling due to padding. + info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns() + + if self.using_model_pipe: + # Replicated (shared) parameters on the pipeline dimension + info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = self._get_pp_replicated_param_patterns() + + if self.args.tensor_model_parallel_size > 1: + # Parameter slices that should be averaged not concatenated. 
+ info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() + + # Parameter that are sliced on the row dimension + info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns() + + # SWIGLU parameters are first sliced on dim=0 to tp slices + # Then, each tp slice is chunked into 2 to create the linear layers L1, L2 used for silu(L1(x)) * L2(x)) + info[PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0] = self._get_swiglu_col_parallel_param_patterns() + return info + + def _get_vocab_param_patterns(self): + if self.using_model_pipe: + if self.args.untie_embeddings_and_output_weights: + patterns = [ + r"\d+.word_embeddings.weight", + r"\d+.lm_head.weight" + ] + else: + patterns = [ + r"tied_modules.embed.word_embeddings.weight" + ] + else: + patterns = [ + "language_model.embedding.word_embeddings.weight" + ] + if self.args.untie_embeddings_and_output_weights: + patterns.append("language_model.output_layer.weight") + return patterns + + def _get_pp_replicated_param_patterns(self): + if self.args.untie_embeddings_and_output_weights: + return [] + patterns = self._get_vocab_param_patterns() + if self.args.add_position_embedding: + patterns.append(r"tied_modules.embed.position_embeddings.weight") + return patterns + + def _layers_prefix(self): + return "" if self.using_model_pipe else "language_model.encoder.layers." 
+ + def _get_tp_replicated_param_patterns(self): + layers_prefix = self._layers_prefix() + patterns = [ + layers_prefix + r"\d+.input_layernorm.weight", + layers_prefix + r"\d+.post_attention_layernorm.weight", + ] + # Add final normalization layer + final_norm_w_pattern = r"\d+.weight" if self.using_model_pipe \ + else "language_model.encoder.final_layernorm.weight" + patterns.append(final_norm_w_pattern) + if self.args.normalization == 'layernorm': + final_norm_b_pattern = r"\d+.bias" if self.using_model_pipe \ + else "language_model.encoder.final_layernorm.bias" + patterns.append(final_norm_b_pattern) + # add Positional Embedding + if self.args.add_position_embedding: + pos_emb_pattern = "tied_modules.embed.position_embeddings.weight" if self.using_model_pipe \ + else "language_model.embedding.position_embeddings.weight" + patterns.append(pos_emb_pattern) + # add Linear bias + if self.args.add_bias_linear: + patterns.extend([ + layers_prefix + r"\d+.self_attention.dense.bias", + layers_prefix + r"\d+.mlp.dense_4h_to_h.bias", + ]) + # add LN bias + if self.args.normalization == 'layernorm': + patterns.extend([ + layers_prefix + r"\d+.input_layernorm.bias", + layers_prefix + r"\d+.post_attention_layernorm.bias", + ]) + return patterns + + def _get_row_parallel_param_patterns(self): + layers_prefix = self._layers_prefix() + return [ + layers_prefix + r"\d+.mlp.dense_4h_to_h.weight", + layers_prefix + r"\d+.self_attention.dense.weight", + ] + + def _get_swiglu_col_parallel_param_patterns(self): + if not self.args.swiglu: + return [] + layers_prefix = self._layers_prefix() + patterns = [ + layers_prefix + r"\d+.mlp.dense_h_to_4h.weight", + ] + if self.args.add_bias_linear: + patterns.append(layers_prefix + r"\d+.mlp.dense_h_to_4h.bias") + return patterns class GPTModel(MegatronModule): @@ -192,93 +290,26 @@ def load_state_dict(self, state_dict, strict=True): state_dict["moe_state_dict"] = moe_state_dict self.language_model.load_state_dict(state_dict, strict=strict) - 
def _get_vocab_param_patterns(self): - args = get_args() - if args.untie_embeddings_and_output_weights: - patterns = [ - r"\d+.word_embeddings.weight", - r"\d+.lm_head.weight" - ] - else: - patterns = [ - r"tied_modules.embed.word_embeddings.weight" - ] - return patterns - - @staticmethod - def _get_vocab_param_patterns(): - args = get_args() - if args.untie_embeddings_and_output_weights: - patterns = [ - r"\d+.word_embeddings.weight", - r"\d+.lm_head.weight" - ] - else: - patterns = [ - r"tied_modules.embed.word_embeddings.weight" - ] - return patterns - - @staticmethod - def _get_tp_replicated_param_patterns(): - args = get_args() - patterns = [ - r"\d+.input_layernorm.weight", - r"\d+.post_attention_layernorm.weight", - r"\d+.weight", - ] - if args.add_position_embedding: - patterns.append(r"tied_modules.embed.position_embeddings.weight") - if args.add_bias_linear: - patterns.extend([ - r"\d+.self_attention.dense.bias", - r"\d+.mlp.dense_4h_to_h.bias", - ]) - if args.normalization == 'layernorm': - patterns.extend([ - r"\d+.input_layernorm.bias", - r"\d+.post_attention_layernorm.bias", - r"\d+.bias", - ]) - return patterns - - @staticmethod - def _get_row_parallel_param_patterns(): - return [ - r"\d+.mlp.dense_4h_to_h.weight", - r"\d+.self_attention.dense.weight", - ] - - @staticmethod - def _get_swiglu_col_parallel_param_patterns(): - args = get_args() - if not args.swiglu: - return [] - patterns = [ - r"\d+.mlp.dense_h_to_4h.weight", - ] - if args.add_bias_linear: - patterns.append(r"\d+.mlp.dense_h_to_4h.bias") - return patterns - def universal_checkpoint_info(self): - info = dict() - args = get_args() + return UniversalCheckpointInfo(using_model_pipe=False).get() - if DS_UNIVERSAL_CHECKPOINT_INFO: - # Vocabulary parameters (embeddings) that require special handling due to padding. - info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns() - if args.tensor_model_parallel_size > 1: - # Parameter slices that should be averaged not concatenated. 
- info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() - # Parameter that are sliced on the row dimension - info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns() +def CrossEntropy(output, labels): + labels, loss_mask = labels[0], labels[1] - return info + args = get_args() + + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + losses = tensor_parallel.vocab_parallel_cross_entropy(output.contiguous().float(), labels) + # [s b] => [b, s] + losses = losses.transpose(0, 1).contiguous() + loss_mask = loss_mask.view(-1) + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + return loss -class GPTModelPipe(PipelineModule, MegatronModule): +class GPTModelPipe(PipelineModule,MegatronModule): """GPT-2 Language model.""" def __init__(self, @@ -292,9 +323,8 @@ def __init__(self, config.init_method = init_method_normal(config.init_method_std) if config.output_layer_init_method is None: - config.output_layer_init_method = scaled_init_method_normal( - config.init_method_std, - config.num_layers) + config.output_layer_init_method = scaled_init_method_normal(config.init_method_std, + config.num_layers) self.specs = [] @@ -310,33 +340,25 @@ def _to_float16(inputs): # Embedding layer if args.untie_embeddings_and_output_weights: - self.specs.append( - LayerSpec( - EmbeddingPipe, - args.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - num_tokentypes=num_tokentypes, - embedding_weights_in_fp32=args.embedding_weights_in_fp32, - ) - ) + self.specs.append(LayerSpec(EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + num_tokentypes=num_tokentypes, + embedding_weights_in_fp32=args.embedding_weights_in_fp32,)) else: - self.specs.append( - TiedLayerSpec( - 'embed', - EmbeddingPipe, - args.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - 
args.hidden_dropout, - config, - num_tokentypes=num_tokentypes, - embedding_weights_in_fp32=args.embedding_weights_in_fp32, - tied_weight_attr='word_embeddings_weight' - ) - ) + self.specs.append(TiedLayerSpec('embed', + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + num_tokentypes=num_tokentypes, + embedding_weights_in_fp32=args.embedding_weights_in_fp32, + tied_weight_attr='word_embeddings_weight')) for layer_idx in range(args.num_layers): self.specs.append( @@ -401,88 +423,5 @@ def _logits_helper(embedding, lm_output): activation_checkpoint_interval=interval, partition_method='type:transformer') - @staticmethod - def _get_vocab_param_patterns(): - args = get_args() - if args.untie_embeddings_and_output_weights: - patterns = [ - r"\d+.word_embeddings.weight", - r"\d+.lm_head.weight" - ] - else: - patterns = [ - r"tied_modules.embed.word_embeddings.weight" - ] - return patterns - - def _get_pp_replicated_param_patterns(self): - args = get_args() - if args.untie_embeddings_and_output_weights: - return [] - patterns = self._get_vocab_param_patterns() - if args.add_position_embedding: - patterns.append(r"tied_modules.embed.position_embeddings.weight") - return patterns - - @staticmethod - def _get_tp_replicated_param_patterns(): - args = get_args() - patterns = [ - r"\d+.input_layernorm.weight", - r"\d+.post_attention_layernorm.weight", - r"\d+.weight", - ] - if args.add_position_embedding: - patterns.append(r"tied_modules.embed.position_embeddings.weight") - if args.add_bias_linear: - patterns.extend([ - r"\d+.self_attention.dense.bias", - r"\d+.mlp.dense_4h_to_h.bias", - ]) - if args.normalization == 'layernorm': - patterns.extend([ - r"\d+.input_layernorm.bias", - r"\d+.post_attention_layernorm.bias", - r"\d+.bias", - ]) - return patterns - - @staticmethod - def _get_row_parallel_param_patterns(): - return [ - r"\d+.mlp.dense_4h_to_h.weight", - r"\d+.self_attention.dense.weight", - ] 
- - @staticmethod - def _get_swiglu_col_parallel_param_patterns(): - args = get_args() - if not args.swiglu: - return [] - patterns = [ - r"\d+.mlp.dense_h_to_4h.weight", - ] - if args.add_bias_linear: - patterns.append(r"\d+.mlp.dense_h_to_4h.bias") - return patterns - - def universal_checkpoint_info(self): - info = dict() - if DS_UNIVERSAL_CHECKPOINT_INFO: - # Vocabulary parameters (embeddings) that require special handling due to padding. - info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns() - - # Replicated (shared) parameters on the pipeline dimension - info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = self._get_pp_replicated_param_patterns() - - # Parameter slices that should be averaged not concatenated. - info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() - - # Parameter that are sliced on the row dimension - info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns() - - # SWIGLU parameters are first sliced on dim=0 to tp slices - # Then, each tp slice is chunked into 2 to create the linear layers L1, L2 used for silu(L1(x)) * L2(x)) - info[PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0] = self._get_swiglu_col_parallel_param_patterns() - return info + return UniversalCheckpointInfo(using_model_pipe=True).get() From 79c3067a125df5f8dc03558b5771acf55dd4fb7e Mon Sep 17 00:00:00 2001 From: Venkat Vishwanath Date: Wed, 13 Mar 2024 10:50:21 -0400 Subject: [PATCH 117/268] Update README.md --- ALCF/README.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 252c2289d5..49b583aa24 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -2,16 +2,6 @@ ## Polaris -
TODOs - -- Convergence: - - [ ] Use `bf16` on both systems - - [ ] Will need to track (for each layer): - - [ ] inputs / outputs - - [ ] weights, gradients - - [ ] Start thread in Intel SC23 channel to discuss convergence issues - - [ ] Add hooks to track additional data - - [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - [ ] specifically, `momentum, beta{1, 2}, etc` From 15d422e306d97c6aab6cd0e4e42383a2039604b5 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Mar 2024 19:20:00 -0500 Subject: [PATCH 118/268] fixing some NCCL issue and updated the script with the common environment --- ALCF/test_blend_full.sh | 4 ++-- ALCF/test_blendable_dataset.py | 11 +++++++---- megatron/data/gpt_dataset.py | 6 ++++-- pretrain_gpt_alcf.py | 3 ++- train_llama_alcf_polaris_hzheng.sh | 7 +++++-- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/ALCF/test_blend_full.sh b/ALCF/test_blend_full.sh index 0218383980..459652a2ee 100755 --- a/ALCF/test_blend_full.sh +++ b/ALCF/test_blend_full.sh @@ -27,7 +27,7 @@ ZERO_STAGE=2 MODEL=LLAMA_7B export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN)) --ppn $PPN --cpu-bind depth -d 16 python3 ALCF/test_blendable_dataset.py \ +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN --cc depth -d 16 ${MD}/local_rank.sh python3 ALCF/test_blendable_dataset.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -70,4 +70,4 @@ mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN)) --ppn $PPN --cpu-bind depth -d 16 py --data-path ${DATA_PATH} \ --data-cache-path /tmp/hzheng-megatron-deepspeed-cache/ \ --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ - --zero-stage=${ZERO_STAGE} 
--deepspeed_config=${MD}/ds_config-gpt.json --deepspeed + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py index 503f499fab..9681198251 100644 --- a/ALCF/test_blendable_dataset.py +++ b/ALCF/test_blendable_dataset.py @@ -6,6 +6,7 @@ from megatron.initialize import initialize_megatron from megatron.data.data_samplers import build_pretraining_data_loader from mpi4py import MPI +from megatron.core import mpu comm = MPI.COMM_WORLD initialize_megatron(allow_no_cuda=True) args = get_args() @@ -68,10 +69,12 @@ if comm.rank==0: print(f"Sample: {i} \t dataset_idx: {train_ds.dataset_index[i]}, sample_idx: {train_ds.dataset_sample_index[i]}") - #### Build data loaders -train_dataloader = build_pretraining_data_loader( +rank_in_parallel_group = mpu.get_sequence_parallel_rank() +print(rank_in_parallel_group) +if rank_in_parallel_group == 0: + train_dataloader = build_pretraining_data_loader( train_ds, args.consumed_train_samples) -valid_dataloader = build_pretraining_data_loader( + valid_dataloader = build_pretraining_data_loader( valid_ds, args.consumed_valid_samples) -test_dataloader = build_pretraining_data_loader(test_ds, 0) + test_dataloader = build_pretraining_data_loader(test_ds, 0) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 210a92c85e..84ccff9a8b 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -504,8 +504,10 @@ def _build_index_mappings(name, data_prefix, documents, sizes, data_cache_success = False counts = get_accelerator().LongTensor([data_cache_success]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + if mpu.get_data_parallel_world_size() > 1: + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + if mpu.get_pipeline_model_parallel_world_size() > 1: + 
torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( torch.distributed.get_world_size() // torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 4fefef795f..fae3309d5c 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -466,6 +466,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): 'for GPT ...') files = [] if args.data_file_list is not None: + print_rank_0(f"Reading datasets from {args.data_file_list}") with open(args.data_file_list, 'r') as flist: for f in flist.readlines(): w, fname = f.split() @@ -479,7 +480,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): files.append(path + f.split(".bin")[0]) else: files = args.data_path - print_rank_0(f"file list {files}") + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=files, data_impl=args.data_impl, diff --git a/train_llama_alcf_polaris_hzheng.sh b/train_llama_alcf_polaris_hzheng.sh index 1fb5345eda..83d8a2c5a7 100755 --- a/train_llama_alcf_polaris_hzheng.sh +++ b/train_llama_alcf_polaris_hzheng.sh @@ -8,6 +8,7 @@ cd ${PBS_O_WORKDIR} export PPN=4 export MD=/eagle/argonne_tpc/soft/Megatron-DeepSpeed source /eagle/argonne_tpc/soft/conda.sh + export PBS_JOBSIZE=$(cat $PBS_NODEFILE | uniq | wc -l) export TP=1 export PP=1 @@ -27,8 +28,9 @@ EMBEDDINGS=2048 TRAIN_ITERS=10 ZERO_STAGE=2 MODEL=LLAMA_7B +#LAUNCHER="//eagle/argonne_tpc/soft/Megatron-DeepSpeed/..//conda/2024-03-11/lib/python3.10/site-packages/deepspeed/launcher/launcher_helper.py --launcher mpich " OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_mp${MP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN)) -ppn $PPN --cpu-bind depth -d 16 --hostfile $PBS_NODEFILE python3 ./pretrain_gpt_alcf.py \ +APRUN_PMI=pmix aprun -n $((PBS_JOBSIZE*PPN)) -N $PPN 
--cc depth -d 16 /eagle/argonne_tpc/soft/Megatron-DeepSpeed/local_rank.sh python3 $LAUNCHER ./pretrain_gpt_alcf.py \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ --num-layers ${NUM_LAYERS} \ @@ -70,4 +72,5 @@ MASTER_ADDR=localhost MASTER_PORT=6543 mpiexec --pmi=pmix -n $((PBS_JOBSIZE*PPN) --data-file-list ${DATA_FILE_LIST} \ --data-path ${DATA_PATH} \ --vocab-file ${MD}/dataset/gpt2-vocab.json --merge-file ${MD}/dataset/gpt2-merges.txt \ - --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed + --zero-stage=${ZERO_STAGE} --deepspeed_config=${MD}/ds_config-gpt.json --deepspeed \ + --data-cache-path ./data_cache_path/ From 73dc82a532d4d5b55cdcaf151887f7e867aeab96 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Mar 2024 21:35:53 -0500 Subject: [PATCH 119/268] further allreduce check --- megatron/data/blendable_dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index f3276c6823..0220fa22e7 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -80,8 +80,10 @@ def _build_indices(): counts = get_accelerator().LongTensor([cache_success]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + if mpu.get_data_parallel_world_size() > 1: + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + if mpu.get_pipeline_model_parallel_world_size() > 1: + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( torch.distributed.get_world_size() // torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // From 4fe87d0f71708404a437684593442607558e4bb4 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Mar 2024 23:41:38 -0500 Subject: [PATCH 120/268] Update 
gpt_dataset.py --- megatron/data/gpt_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 84ccff9a8b..210a92c85e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -504,10 +504,8 @@ def _build_index_mappings(name, data_prefix, documents, sizes, data_cache_success = False counts = get_accelerator().LongTensor([data_cache_success]) - if mpu.get_data_parallel_world_size() > 1: - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - if mpu.get_pipeline_model_parallel_world_size() > 1: - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( torch.distributed.get_world_size() // torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // From 39be39565aceb16dbb8b2e04d41b45d183cfee71 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Mar 2024 23:47:58 -0500 Subject: [PATCH 121/268] Update blendable_dataset.py --- megatron/data/blendable_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index 0220fa22e7..f3276c6823 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -80,10 +80,8 @@ def _build_indices(): counts = get_accelerator().LongTensor([cache_success]) - if mpu.get_data_parallel_world_size() > 1: - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - if mpu.get_pipeline_model_parallel_world_size() > 1: - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, 
group=mpu.get_pipeline_model_parallel_group()) if counts[0].item() != ( torch.distributed.get_world_size() // torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) // From b0c21335e23f16c3dcb0b666cec6b01376e66cc8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 15 Mar 2024 14:51:41 -0500 Subject: [PATCH 122/268] Update `ALCF/helpers.sh` for SunSpot --- ALCF/helpers.sh | 181 ++++++++++++++++++++++++++++-------------------- 1 file changed, 105 insertions(+), 76 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 1e570ee43b..ccd24209c4 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -31,21 +31,23 @@ function setDSlauncher() { } setParams() { - # [Parallelism Settings] {{{ - if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- - TP=${TP:-1} # • TP = 1 - PP=${PP:-1} # • PP = 1 - export CCL=${CCL:-ccl} # • CCL - export BE="${CCL}" # • BE = CCL - export DTYPE=${DTYPE:-bf16} # • DTYPE: bf16 - elif [[ $(hostname) == x3* ]]; then # ---- [POLARIS] ---- - PP=${PP:-1} # • PP = 1 - TP=${TP:-2} # • TP = 2 - export NCCL=${NCCL:-nccl} # • NCCL - export BE="${NCCL}" # • BE = NCCL - export DTYPE=${DTYPE:-fp16} # • DTYPE: FP16 + # ---- [Parallelism Settings] -------------------------------------------- + # -------- [Aurora] ---- || ----- [SunSpot] ------------ + if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then + TP=${TP:-1} # TP = 1 + PP=${PP:-1} # PP = 1 + export CCL=${CCL:-ccl} # CCL + export BE="${CCL}" # BE = CCL + export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 + # -------- [Polaris] ----------------------------------- + elif [[ $(hostname) == x3* ]]; then + TP=${TP:-2} # TP = 2 + PP=${PP:-1} # PP = 1 + export NCCL=${NCCL:-nccl} # NCCL + export BE="${NCCL}" # BE = NCCL + export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 fi - # }}} + # ------------------------------------------------------------------------ export PP="${PP}" export TP="${TP}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" @@ -69,8 +71,10 @@ setParams() { export 
SAVE_INTERVAL=${SAVE_INTERVAL:-200} export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} - export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) - export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/eagle/datasets/dolma/utils/tokenizer.model"}" + # export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) + export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) + export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model"}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # ---------------------------------------------------- @@ -120,8 +124,9 @@ ezpz() { echo "Using $(which python3) to install \`ezpz\`:" python3 -m pip install -e ezpz > ezpz-install.log 2>&1 fi - source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit - source ezpz/src/ezpz/bin/getjobenv || exit + echo "Done with ezpz." 
+ # source ezpz/src/ezpz/bin/savejobenv || exit # > /tmp/savejobenv.log 2>&1 || exit + # source ezpz/src/ezpz/bin/getjobenv || exit } saveDSenv() { @@ -151,7 +156,10 @@ setOutput() { buildDSconfig() { # ---- Build DeepSpeed Config --------------------------------- export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" - bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit 1 + echo "DS_CONFIG: ${DS_CONFIG}" + printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" ${ZERO_STAGE} ${MICRO_BATCH} ${GLOBAL_BATCH} ${PP} ${DTYPE} + # generateConfig "${DS_CONFIG}" + bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 # ------------------------------------------------------------- } @@ -174,18 +182,24 @@ sumFiles() { setEnv() { - if [[ $(hostname) == x4* ]]; then # ---- [Aurora] ---------------------- - SETENV_FILE="${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" - if [[ "${SETENV_FILE}" ]]; then - # shellcheck source=/home/foremans/anl_24_release_q4/llm.devkit/setenv.sh - source "${HOME}/anl_24_release_q4/llm.devkit/setenv.sh" || exit - else - echo "Unable to source ${SETENV_FILE}, exiting!" - exit + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) + echo "Sourcing ${PBS_PARENT}/setenv.sh..." + source "${PBS_PARENT}/setenv.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ $(hostname) == x4* ]]; then + eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" 
+ eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop fi - elif [[ $(hostname) == x3* ]]; then # ---- [Polaris] --------------------- + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + echo "Running on Polaris !!" # ---- [load conda] --------------------- - module load conda/2023-10-04; conda activate cu118-pt221 + module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE # module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/py311-cu118 # ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 # export PYTHONUSERBASE="${HOME}/.local/polaris/conda/py311-cu118" @@ -203,8 +217,13 @@ setEnv() { } makeHostfiles() { - GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - export GPUS_PER_NODE="${GPUS_PER_NODE}" + # GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') + # source $(python3 -c 'import ezpz; print(ezpz.SAVEJOBENV.as_posix())') || exit + # source $(python3 -c 'import ezpz; print(ezpz.GETJOBENV.as_posix())') || exit + source ezpz/src/ezpz/bin/savejobenv || exit #> /tmp/savejobenv.log 2>&1 & + source ezpz/src/ezpz/bin/getjobenv || exit + export GPUS_PER_NODE="${NGPU_PER_HOST}" + # export GPUS_PER_NODE="${GPUS_PER_NODE}" # ---- Make MPICH hostfile ---------------- export hostfile_mpich=hostfile_mpich cat "$PBS_NODEFILE" > "${hostfile_mpich}" @@ -215,7 +234,16 @@ makeHostfiles() { } setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------------- - dfl="${1:-/eagle/datasets/dolma/data_file_list_reweighted.txt}" + if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- + dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" + elif [[ $(hostname) == x1* ]]; then + dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" + elif [[ $(hostname) == x3* ]]; then + dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" + else + echo "Unknown hostname. Must manually specify DATA_FILE_LIST." + fi + dfl="${1:-${dfl_fallback}}" # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" printf "Calling: \`setData()\` with %s\n" "${dfl}" ndocs=$(wc -l < "${dfl}") @@ -238,50 +266,51 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- echo "--------------------" } -buildCLIargs() { # ---- [BROKEN] ------------------------------------------- - custom_args=" $@" - export CLI_ARGS=" - --$DTYPE \ - --num-workers 0 \ - --split 100,0,0 \ - --log-interval 1 \ - --use-flash-attn-v2 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --lr ${LR} \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --seq-length ${SEQ} \ - --num-layers ${NLAYERS} \ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${NCCL} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads 
${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - $ds_args \ - ${LLAMA_ARGS} \ - ${gpt_args[*]} \ - ${custom_args} \ - " -} +# buildCLIargs() { # ---- [BROKEN] ------------------------------------------- +# custom_args=" $@" +# export CLI_ARGS=" +# --$DTYPE \ +# --num-workers 0 \ +# --split 100,0,0 \ +# --log-interval 1 \ +# --use-flash-attn-v2 \ +# --no-bias-gelu-fusion \ +# --lr-decay-style cosine \ +# --no-bias-dropout-fusion \ +# --no-masked-softmax-fusion \ +# --tokenizer-type Llama2Tokenizer \ +# --no-gradient-accumulation-fusion \ +# --accumulate-allreduce-grads-in-fp32 \ +# --use-checkpoint-opt_param-scheduler \ +# --lr ${LR} \ +# --save ${CKPT_DIR} \ +# --load ${CKPT_DIR} \ +# --seq-length ${SEQ} \ +# --num-layers ${NLAYERS} \ +# --hidden-size ${HIDDEN} \ +# --train-iters ${TRAIN_ITER} \ +# --eval-iters ${EVAL_ITERS} \ +# --distributed-backend ${BE} \ +# --num-attention-heads ${HEADS} \ +# --save-interval ${SAVE_INTERVAL} \ +# --eval-interval ${EVAL_INTERVAL} \ +# --max-position-embeddings ${SEQ} \ +# --micro-batch-size ${MICRO_BATCH} \ +# --data-file-list ${DATA_FILE_LIST} \ +# --tensor-model-parallel-size ${TP} \ +# --global-batch-size ${GLOBAL_BATCH} \ +# --pipeline-model-parallel-size ${PP} \ +# --num-key-value-heads ${NUM_KV_HEAD} \ +# --data-cache-path ${DATA_CACHE_PATH} \ +# --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ +# --tokenizer-model ${TOKENIZER_MODEL} \ +# $ds_args \ +# ${LLAMA_ARGS} \ +# ${gpt_args[*]} \ +# ${custom_args} \ +# " +# } + printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" From e27f381da714402ed417dd4a5e0339682f8f1be8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 15 Mar 2024 17:58:54 -0500 Subject: [PATCH 123/268] Add `train_llama_alcf_sunspot.sh` --- train_llama_alcf_sunspot.sh | 167 ++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 train_llama_alcf_sunspot.sh diff --git a/train_llama_alcf_sunspot.sh 
b/train_llama_alcf_sunspot.sh new file mode 100644 index 0000000000..ad45ee8ba2 --- /dev/null +++ b/train_llama_alcf_sunspot.sh @@ -0,0 +1,167 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +module () { + if [ -z "${LMOD_SH_DBG_ON+x}" ] + then + case "$-" in + (*v*x*) __lmod_sh_dbg='vx' ;; + (*v*) __lmod_sh_dbg='v' ;; + (*x*) __lmod_sh_dbg='x' ;; + esac + fi + if [ -n "${__lmod_sh_dbg:-}" ] + then + set +$__lmod_sh_dbg + echo "Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output" >&2 + fi + eval "$($LMOD_CMD $LMOD_SHELL_PRGM "$@")" && eval "$(${LMOD_SETTARG_CMD:-:} -s sh)" + __lmod_my_status=$? + if [ -n "${__lmod_sh_dbg:-}" ] + then + echo "Shell debugging restarted" >&2 + set -$__lmod_sh_dbg + fi + unset __lmod_sh_dbg + return $__lmod_my_status +} + +# +# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" +# conda activate q4-drop + + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- +cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE + +# PARENT="$(dirname "${HERE}")" +# source "${PARENT}/setenv.sh" || exit +# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. 
determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. print job info +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# Take custom args +custom_args=" $@" + +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit + +# hf="${HOSTFILE:-${PBS_NODEFILE}}" +# nh=$(wc -l "${hf}") +# if [[ "${nh}" -gt 1 ]]; then +# launch_cmd="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" +# else +# launch_cmd="python3 ${EXEC}" +# fi +# +# echo "launch_cmd: ${launch_cmd}" + + # --use-flash-attn-v2 \ + # python3 ${EXEC} \ +run_cmd=" + deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + --$DTYPE \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --lr ${LR} \ + --seq-length $SEQ \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size 
${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + + # --------------------------------------------------- + # --vocab-file $VOCAB_FILE \ + # --merge-file $MERGE_FILE \ + # --lr-decay-iters 320000 \ + # --lr-warmup-iters 5000 \ + # --lr-decay-iters 10000 \ + # --num-workers 4 \ + # launch python3 ${EXEC} \ + # --data-impl mmap \ + # source ./ezpz/src/ezpz/bin/getjobenv || exit + # --------------------------------------------------- + # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ + # ${DIST_LAUNCH} python3 ${EXEC} \ + # deepspeed $launcher ${EXEC} \ + # >> ${OUTPUT_LOG} 2>&1 & + # >> ${OUTPUT_LOG} 2>&1 & + # |& tee $OUTPUT_DIR/output.log + # ${EXTRA_ARGS} \ + +echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +# echo "${OUTPUT_LOG}" +eval "${run_cmd}" +set +x From 7bfef8f559c42cb2d9e1b523ca864278ad293de0 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 16 Mar 2024 02:05:39 -0500 Subject: [PATCH 124/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index ccd24209c4..5cefe4829d 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -39,6 +39,7 @@ setParams() { export CCL=${CCL:-ccl} # CCL export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 + MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 @@ -46,6 +47,7 @@ setParams() { export NCCL=${NCCL:-nccl} # NCCL export BE="${NCCL}" # BE = NCCL export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 + MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 fi # ------------------------------------------------------------------------ export PP="${PP}" @@ -74,7 +76,9 @@ setParams() { # export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" - export TOKENIZER_MODEL="${TOKENIZER_MODEL:-"/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model"}" + tm_a=/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model + tm_p="/eagle/datasets/dolma/utils/tokenizer.model" + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm_p:-${tm_a}}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # 
---------------------------------------------------- @@ -222,14 +226,14 @@ makeHostfiles() { # source $(python3 -c 'import ezpz; print(ezpz.GETJOBENV.as_posix())') || exit source ezpz/src/ezpz/bin/savejobenv || exit #> /tmp/savejobenv.log 2>&1 & source ezpz/src/ezpz/bin/getjobenv || exit - export GPUS_PER_NODE="${NGPU_PER_HOST}" - # export GPUS_PER_NODE="${GPUS_PER_NODE}" + export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST}}" # ---- Make MPICH hostfile ---------------- + hf="${HOSTFILE:-${PBS_NODEFILE}}" export hostfile_mpich=hostfile_mpich - cat "$PBS_NODEFILE" > "${hostfile_mpich}" + cat "${hf}" > "${hostfile_mpich}" # ---- Make DeepSpeed hostfile ------------------- export hostfile_deepspeed=hostfile_deepspeed - cat "$PBS_NODEFILE" > "${hostfile_deepspeed}" + cat "${hf}" > "${hostfile_deepspeed}" sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" } From 9c6b8932d24ecaf42c9a9d9c339ac7034a29383c Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 16 Mar 2024 09:39:50 -0500 Subject: [PATCH 125/268] Update `train_llama_alcf_sunspot.sh` --- train_llama_alcf_sunspot.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh index ad45ee8ba2..d4cec63e33 100644 --- a/train_llama_alcf_sunspot.sh +++ b/train_llama_alcf_sunspot.sh @@ -95,6 +95,7 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --$DTYPE \ + --cpu-optimizer \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ From 41a23f16f5741c5cef37254486be5b27f1d91513 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 16 Mar 2024 09:40:10 -0500 Subject: [PATCH 126/268] Add `ALCF/tokenizer.model` --- ALCF/helpers.sh | 8 +++++--- ALCF/tokenizer.model | Bin 0 -> 499723 bytes 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 ALCF/tokenizer.model diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 5cefe4829d..c328f0ec63 100644 --- 
a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -76,9 +76,11 @@ setParams() { # export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" - tm_a=/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model - tm_p="/eagle/datasets/dolma/utils/tokenizer.model" - export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm_p:-${tm_a}}}" + tm="${PBS_O_WORKDIR}/ALCF/tokenizer.model" + # tm_a=/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model + # tm_p="/eagle/datasets/dolma/utils/tokenizer.model" + # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm_p:-${tm_a}}}" + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # ---------------------------------------------------- diff --git a/ALCF/tokenizer.model b/ALCF/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..22bccbcb41ec929cf0c9dbe8f41036db82e5e773 GIT binary patch literal 499723 zcma%^36x~lS>GSn7-+l{i^1$=Sy&@mmP^t|8X3ucJu^Mhh#5^;J+d&i>{s1gJyTM5 zSE-j7jck!8gN*?L#b(T21_Ksg(w5}~OSYfYh7dwXH6er$LMjd+gb>mpgb+eVe*f?P z-W#9CIZ67=ne+Yc_APgL_kMS&x#!M(&aFEe54`gE34bs6?73&pJ>%A`5vR`KCSQoA#J*+GDw8ycgJ&rZ)aja>NV@-P;Yue*j(;ml~_BhtG$FZh8jyLUb zylIc)O?w<~+T(cB9><&ZINr3!@uoeFH|=q}X^-PgdmL}t<9O2^$D8&z-n7T@racy# z_E>1zW1(q}g{D0gn)X;|+GC+d~)uK=bmkD{{J(1Dn>&FRp%sby8`AMho-R!vb@hU$OKYkgzAAETNS?McE5nsl5|wnQ0y6G3 zk!VYaI#sFLQs8hzEUU~TRfdy*TO)i^Nz$k1zIN^fpIOnLNxRb3pv3f!s_>yqcq$56 zQB`!!M*^I!_#6U!V}$o)v)5PQ#{gMXP~={zq85odUjfzLS_S3-qYAtpuw4PMf9UD5 zx%9$jZ?59b5|@VvLy7;ED*OWATO)iR(cb1HlK$-$SeE~eAa@k{jw+)n`MV>$uZ&|S 
zQM>%UAP36$16Asm0N&|vWqenVo9c`|Qt2NJ|Hmp6J>n-K*@IGl668?r`=_cxPXpfL zW)v0vvsJ_;BHkMrwq%o^5AuS@Ukq|tktry@N1>Br0FEh`j352J#g4thJrr%y%Rb@zc^jl4tkl3%OJJP!Ev2p_1G z{YDt4x9#5cn^jG(#sD*|GN}rFs{*G0zg;D~6vM#pR3=`M>c1PsugPM+7ouGV(SARI zd+O|e5ahNJ&#J5gHHA-CC_4WiMzYNT5q}iqj#T;MN{}l6e^O<*s9yM|l~~J>|IezF zcMx|_rHp$0^C)j1#r~p7wE+0b3Y-A^RU}$|0P?S^+&$Ik-$XVF$#Usagw#V9Sk zKWi_Btw{0Nd*#@3HxbWpM9A>WpxvoHXK(JgbDtYSat!m&+AD{zo?67`Rz+M1_`C|d z2=MvAbEqaU7vzq5&KFck?gczM!q3aR&x!D5H=E~HiEbz6d6p%rq!o03OUC}fD$!kl zFLFfmsxOXGNBSOoNiC~#JikKGPrh`o96Lra9DZ4)=EXtS<^?WA4dKfxPBQmbl$=!H zS2~U4|EeIb{(8u-4suOn?APqged*luK66}!_R#9j-11OHXl88b(LOgig|HWz?%S9RZ;Q%KNM)W>{3_PaSxI5C6-#%_0k}B)E%y= z3cDZhGA9v}Yn?;_zr0FvC!iBz4dTOdT~+>yOmTgc>@$@cYC!t)aP)=Gqz0d6y=80KGDnfYvt zX0NK?#!zykRUsQHa^4!Qsqr7JVq#iyEch;KOgJ8iHlxxOoFOuwh$Jt6IZ190a;R2% zN66Jb7xRIWdvjknH%Bv}%FDs>&b_&t=Q@p6^#6ozW=!&y%92k7oBc=k-vZCmAyqGW z(p@3S;9-K#1bOwfkX;L?;p5enp3_f&?ygXb#EX@&F2=rn`V|s$yOPcwNYIPId75ipiEfmiI+OPnLXxi;YBYjLgHG zH8trh(nKBfoLsGKDaeOZ$;*~B29TAY2T`g0Nb)kZnAM8w^l{L+3N2|=Js-)|q}sqT z#2|8i$grP>HV;I^QhagNoGALqy6I?98&#=i_0bqc%+8tw996b>1n^)5?g4C8dEy(i zRYj>fw_QMt={r?7Ip2%P!G(xh8)(vcF*0HGj{qL3YIHfiD3?ODZB^mJk;P-yn#?J zHk39wwAU|T!f!F3WyL_VJ{p{h&&s!!!YBgT3#Y4)@~xGUJym%Qba9>+R%nY_$lq4w zyOn(3Rz=ODYhI)MM~QoTFzanI*5sxK(Vo7&$|ED@Al49`27E`*!%doKZhEk-=JcH| zDZ-(@eC~_>Zg;QyH&Gh4@^@4!EybYrU18E8)tj5VO|z2ku4EZW77()Rgp}VC^hnJ! 
zH{FZ*#kiD_k6&KWvF7<+%ecXZFo(vp(~nL+4vX(|L8`JG+8?@=zCM`WZ<$mSB&>&< z`u_bu&=-R>v=)^5)c#!X}&wr4&PN}=}Eg9 z#XR$eBiD|qz~o4h{zxz!Xx5RNb{qUrYx*&1ngft(HG1@qSq_N=N$?K0MieIK!B>+Fj(?4m3-?o&b&q!F;4Qcb(~BE zf>F#$ey)n>$+RFu+u76z@!o1AV2ZB#02B9$_$)qVjiXaSTZ&!`AU_{vud+7CO%7zO zUx-BgEl+B4piB@`sb8!lh=v71+ckgk-d9Cw#LGd?oBKR$b@9BXF259|3{^^A(xKDe zuOu2pLjtJ6&fDjU!@Auf{0jEjXrK!HIL4r30~G<{DCNE7%evk zm_>YBo~Zb5s*ZCY%J$<^^TEh#x%zdKU>-H^{5yJFe(WF<#m5`w5I zYF%UDQ^EOKrX)Ee;UiBEKNjHwjU|v)H%71I<5l}&kX3PDw{v~nJ`sSSiU~#zK>Dqa zzHr2o(INn-8_n*MrG_lV^O8QNR{V4%c_Xvz9Fk&2G3@RIJ=9bqhi2)tvr)~sKUHbD z!bBkl$=aJz@>e2eP&Q%^+@n$bs-^ni^kd`zkmHCFLEG;y0kS%Z_R%O2yxv=}I zi35;pGd|}3u*x-kf*25n>2E|+`1Q(a7l)fNYK_au*gvwpnP%mX{H2i; zh9&~OFDBw2S2D%4Jg?zSjN-o;*;b-!Lbv8VpKS9R<)HRYYGDSdgdB_xgNI>~OoPe( zTcQR5M689YWWQ4Gm_FMeo5c3|Etep-L~eQ*^QC|4@C~U9O=laG>!aTeKGP?IkHNMF zuzwbLc4AP6rmp(xxW5zRj)r!uqv_39 zEB(jFMa$)bn*(gYt?18x8gYo95(lu)W%>zyG?M>RSs~^lAe8HC)&6Hy&)Mv|ZRUhv z%U-x`|8vEnx1l-%+Lk9(We0n6Uo`gxFkGz%sGKlLev|yH|0T$)t|1>JMeaSOdCQ-> z_?X;5%lx|z7$tLH)Izf}^Rk68lH624kfmA|Zt zWW}Lj2P}S1#(d?!S4pQICJIE3hlTp&vhKPtjR?IZL?je=J45{3fq5hKPQ{Pn&%n-r_=PZaMWNcmSxZQRN2Ir z<%Umd`04!n(HDq}8j1=?$rLl=XI7bdD@-s_AW;LY;`5T0)1ol#&)w&QaLkx6V+tU{ zT1=1}aUp!p1>oAbt7!25KmR;Kp&Cj2O_chq5LOG^91;eb`s5U*`x-!!&ke!DVU2(( z5cPT*b|A?SV_gA_xptPk@cz7r>FMi}LfepAYV0+-ml6sPvb>Bx7=3<}yrn!QV=#n- zN$;GD15I)U8qIj-N*=0kXb(D{&VK+Vj_Rwi7iF@T3w%LjiAqeN-9|S0G2;VU%)JS@ zoN^Ei_St1MoI~Q&fVqRlI)K@(^yy0lP-BC9iq8p7F{q}{=5+DAyhl0MoaQla0V2~y zntSrxAlKEVOy2r(l0!QA-E`2RMzI=tURCs9L%lKuQ`nNmBa=PcB?Y9#d{HN=XK0HG8B$ki*3X&3>!?J(4-*+kYzi@L(?(V7iFR+V{HGTN;F0}%1*(E z7T*p?s&V?b3M&j~j_8_vag}K$+zlz%x@!@YgDB#@L*#-izcS&$`I3-ms2Nxa?L7CW zRXc{RLMCYPYq5|#KQdga>30h44AXa$!7-O)&Z3Ta`WMLarIBYK??DPpQ3JkJxr1Zn zR6x6^{cj<|msQ$a>_ry_lPLx=NK81?cdvkUrdWo&phBA3q#$=z1kC|tiUm{w?M5}+ zh~>nWSJ}2$v!x(as$z0zieN`bJqTdCdB5r_LZYi#e5WQ4;Y7+IooHR#90icVdKa`9 z{K}xWwb_$GdwRQd-cyU55H#uAzgvJTVmV*@stWhk~7%u7swC3tP@ zc71hl-TQ#JLOa(3pxiNsv_Vlo+g5Kz(66bO;Z0Dh0AqU%u{oq?2(R9$*4G7L=s1We 
z{MS}lnfGA86l{*iDJTb#awz{x0bRDj*pF>ja5cUQK^%lY5K+BteQk1s)!1z;0Oq)?B9kj3;o30TBmg)ihUFX#;~lRr3Xn`0tL?9=#67RiN(!{f z>NfzJby1z5PPrU=6)%oRX8I!yYalbce@Fc?2ipoGZM7CanQJ5b<5vY3tjYnL0+401 zuZn|4qet*F7mzM*+a8reHi(=GDKv#mKeqp!W7I|eDpNMOs^h@eGkf^rVk2xuT5_Tkg0qqh_sZqVmIpCT?Izmh09Aup~wC7m> zDd9pah_1EOp3s*$H5kSwbPjN)6}cq~ASL44i5=9JSLL51E(JO1R;;>#*yWN~L6#ME zJ{ISl3S(WhV({0^ea-)U0_3TW8N@WIAeZ4_TPD6P^y$Y(1R7JZ*wg$B;DK&b&Tk4r zY?tXzlJWYgoE3d$QXr9hLBePtw9o;Q0@7O8G|3GS^|Duz2hyU@(BKG!{YFooy24($VJW93n29~Sk}0GqwD8!eVS7Pro(8R1C&bZ>%#O-Omc6k zS~Lrv7(oe8$9O6nNdR&UH`U^x(U-MZ7VaiBq>a3_t=U%LWUB+pOZ`T)2F85vW6lQJ6WI73c=i(1l(x2R!1wgGPq zo{zBDNKIZredmzQv!(s$0%(O}rhS# zEG-f3Cz;imXaU4rc3t;8W z$MyhFttO`Fb<}1w<-2MIvA2|ho%3|K-GJ!0pKBpd7+e^{BJ@nyf-O|dkWvsF!cUV! zyRw+#M^ka+W_X*rVa@ZZ0Z5hoZLtOg*tGM~3}h+#u|fFii0rS&?tE%Q-G@a$PTGsH zABZ4)WiZLzL2t$A1MMcsq&ED~IU&xts1HQ}M4C~klPp$oo3dXDv~Y_w#vD|({Na!p zQ~@SuZz*OFXFpXQ5Wx}T(IO?tj4sHjlpEr79qJ=d0O`fc6r9`>@{U4YXa%E2?%>`E zbm8DD=pxB$f-RdU`f>nJySL6LoI`6|xQ67A2##wby6C-?AU!#6Q=mkO0JY05CoL$ z;dK9|poelhrKXp|tpRPDZ^u?faT%WvPu=UPcWdn!+YN zy!iS^u-VftbqY}Xk&`5cF1!1WkriZBy*+TB69zZaF>T7Bohj-IRMx$ELh^qLoJ8?wwjaB{=8r)K#GwI8igV1a>rq6`|^O{}wWK|dJdBK$ufIS}3 zb{M3qeZ8E%7LL4v<#W;tIhcvTtfa%&0c}0#?}!L=8O(;g!M+p>&fG;llRYhtAd#2v zcB~$Z*j0`JWI0$Ji`H&B=C3(4THWPIqa*B%U8!VPk_;&jlf^nKhqhf7=wSs2WqWd9 z^(#Z1vlBTbfVh*$nL|30CPxKGz4%$Zp@0r$^ofRe-IT*g2Jo_Fp0@VJO=|sr`5y2Xgh}3|^ zkS1^dQiTh+fOcCLE_r`DS+k8bERxnBsp*t z$M6i2rg8RZ6gf&O8sPn7SjhqAvOudZKvvqP86K>(u=6SfqXnCZm^BADr-r!#1h<`a zT5+-&F_p6dM437s7+*a{B4n&W9JyHfOp>jN|C;;?DG=SXqmV~n2b z;%Mobf{>KXOEGyJwMtassDby1c5FQwQwm9zfp?Aps587s_5!*b!TmA1P~mc21}5Vs zUuSsLk+cJFQAX~BH|Aoc-YEQ2K-Exe(my{GgQg0@;r?#bq;o_0J6Xe94##@hSu z8LVqaO##jnBYy5UCVzaciaKtn^Fy4yC{!KPkFL((=lvK%5Zrv z3TV%MN5k-$%?Ln7i3W0f0|2m)+<%>^@%tF!cT)tD6q`b@w`7IP>D zpaQ5)pz(YL4Z~9lF#ML#tRE}>Hh?tj!(qiTB79q!!B*^~<_3Hz#H9o)+PEzMl)FO@ znwh*GJ0#y)3Az;ymp0gHoK&rY;JNo0mdQ-7?x=&;xmLo^lRDRuwkaK}6D)D2WCo@I z&~PwZ^41XJeoAaZ5{?ltK?<=1{kHws%M4Pcnm^mRE 
ze$wo1!ASQ*+K{Lb2hLMyD|oMza-s+Hm3N{|`93Dl>IdxlqmCokF2HioZc$z?e4Fj_ zA(FMBvG1yS7^IJx_a7>~Vi`eJk1x5udf> z?RD!rKhXAOn*f-f)yx#qnZxs!0oAnhp~;~w#ECetGy%I{T&b}*D+sZ5G#41L}7#22~WN&QfU#>HYsmq z3Q0LDI^B{%mm}TgFb*&gV3yM#hYg6*&TZ=KVo^t(kqJSz#JO-?e0St~a3}T=+aOr1 zh3CZdP=|UlXlII#LJqX8@!4RBNgcfsu0f!byasM2FVG&7@2N5|iXuoGjEsB-2eLvfD}3XxDJ0L-y7ikwzl8f2F#D? zyD7j|hrMQt;V&c5da#a?-Z=r*L*vk-Mhl<}t3Y z`;I^tjB8uTv&-?CmV3Q4oNkdiV%uCXdl~lOK%YFNO=V z4JiBA4^XEdm|-E9Ioa|+YUKu_HTiKSP8BDr3eyAi>lw6rR9B-_^3LF{A)*b5MmQEQ zR|>EeTRYlX%pA!Aj*&Qbqz)U!GHg;OUWr*v0hT$m!!R?=6dz_kRPo-HFR%@S;c*|A zFrZs#)C9OCOg~Yc$0x+{tc3?{Qm5GJGe^|tfe?(url3v^%s`aQj7H<nxR{i`ErTREzA{)qyP&%@ zC2^wD^KgK)kjG@JnbR?cR=LSfRu;M#m%7?um!cC`sZoz>9aL9VB4((Fnj7IwP>oGM z$oDW!w1BkHeO7D+Hh;W_aGZq9JNm%1O<#(B1dWN#hO;&Ul(X4t<3A^;?87nroH(x! z>eR{Ei1es8BZ%TOO2hNrq1A&saeb=|a^Val9M)DRCdlI|7e>^IGUwofc}BJcVQ0P# zXli=zeYBba+WkexEz1x6%n2?;!~CgA==GR#w85B3$9_^s%G}XbbOvn`Eo)fKfv$ut z+Q9^5#XXo6&e)UZX$Fm!YyG%>^3zecPAjz`oo9Pfd(5e$4qXW^Nci^Ee)BrndaT4I zj@x@TaRszxx=UMxGoU*NYbD>xc#jK`U%3s99JE!&n-t)j(Rs2mE)JpQ&U7{`=?HB6 zD%d$w%wzcO&43tcRpb88RJw3D0>ibzSch#MHQ*G4ImBXQtrP~ zHx!FHlEt8gEHF~*%A~w($n`-fIoCK) zNkc3^sI{$884_W9Fgf|Ts+19{EfTf~a4FGIQXm`$tNH{$yLIUNMD7?|)$pn_t;
6chr$CNJq|oX{wwE3<`zH)AeR zCebV-d8|@pU9F=HMDO9I_SjQ^Ta`|PN5>y<)gWEmr8q#Gg2}|!nHHTn-G~`L4r%Fc7aOO-X`@vS!1cbVHOb7GmgM64#gOKr zEYpT|s*74Iq&19VDH)LB%kvz09eG^m+$TV{f+bFA=&MQq2EXE3xH;omxi9&?N}J_q zyKOMq+#kmwQvkME(r}eIf|H(V9FFK|6Jr8R(Qo9KUjgY|w5{PV}=C@^;kU57`zM z;00-GZm5V92&0~QR8~iE%otukl^7$>#OdCs%N2#`yER{&LAw>K#}qx;bu0LU=0a`2 zb#pIa%=`cR=d)X`F-A>&c{9o|gORUWf-&>mXHef{Mq?}>k(HwseBWjOOkM4^yg$&g z&q1|8E^f%25dFb6y+y1~kLeCT%rrt=k;|jfXK3=@dnc6;)X!%?88otP4dQ-o4uTn{7btyFMs3@r<4k*xQE4;N{W#9{fso;L zRV}2t0{g6FPmNG3CY&Ja8OsDMo}KYptT%zQS}S@5p#ah*;w|%3SM>AW9mZ_APhFO5-aM2iy8tP6IYs&57tG-GOZ0(O?a|f zM|#621Eaz~`ymXAInc8ndBvk;P6$F?P7kPl3Xttl?F!8ZL8!fVW8maN73(GTm#{3D zxChiAQfMlD{%(|ngelt|aS1sG+9D@WY~o1lKZqM2z_Q_uRLl&FHbk@3NIqOe-WzuZ z+F*ESBPoT12Rq*}Fl0cPPTrp!+DhutmzcFq2(kzd^Eob@_=4UQnL)#E)prd)Qt|VN zg*KRMy`@+xq;)LY27I?Na5+5TUewAwC&ax}8w->^aRF$f=mKQjUI=4=8<)=*{c^+& zwqyAMa5h`bOj@W zq`aPLCWppa{_ac+d}ge0hO?+YgLK6TB~Jy9`!yMbBsU8anGaAhjBgc?Oy$wbypH^I z&L?OB#GqHm%~{995`5-tROV4Ek$fyf9Y%dY+e+)P8I*!ypk)QzGf>4HnAeHD-kyNb zIZnP7bsY0i<_j}0T5x}V7mgpVxLk;Hne7^%8|pZ53RdFx$4mo&8pqDUuujK$2nK-( z7=}ygZbco(j-nV|#&MTA`9#&mx5^W}Z9>RB(8?o)c28p=;`vfWkWIlpsk8+nl>Md? 
z0CpLy#d{b92oZZUvl*nD>w5XbM-eGG@2*vF8{}v@o*HdRfKB=u!3?NQun0esk1%>n z>O_5Q4;Da~CtPYX09y3p@X#kKk+%lA1KtKJ;Zs_trvQpRi5)XYm{xlsxs$PCrm;@| zR@+zg3NYm_Ytw25?Qw!*tL}eKSK{-E4-A3MJPHS23Pynaw?;+QX^+J*uo_11od6N< zr7uN4NSCr6*92x@vyFyMk4p9`wh!oLM;izme_~Sr1>JWodP1s(ZL!ond4q} z6-@}zEbxvX@C57vH{~WSfS$#&vMui_%m`DH`9$>7Up06#TwMlyq%^AnVB~9QoSDNp z6rwK{K>cDEhukMdT1lc#0W$Y(J)bgj54*Q91#ZQ-kU}K|v5=QIlOYW00D3oQn9hh|&2hL~MgB;+=6s4M@UaxU!*b z=l(Us0pfCiuy62In0NGOEOPB#MBE1~z{*y`%if9AK z#a>c)=8Q_;VdjLWa1Fd11vrb06HwWwxAnGBQOD{^(hO7$pPMvmcus>So*pR(ux!!r zQpf2m7*z_O4$oVqOr(vIsESqewmh!6`Puut`xzs`t)7)XAB<;riPl-u~OvsM`Qrl_f2m z6Nm`wdMYo2t~wWo!n6&SJNiPjRnw#MBrTwAa8^jsAZCPEv@TjF|17xApMoKz>MhP^ zrw)HT4v!4d$_>_I;*vXIKh94~4A={z6%_zCTxJQhq!}2Y;~-e_JHeOkq%)#z5X|^w zX;cTOGS#{>NAx#Vh{&CXZZWTKmy<0Xy$eoGhJW`7*f<$p`REP*JPa3Gwv^EZs_<== zgsJH@-8IOdv3vM>Og=Dx2c%7_wVl0Y1XW6{t5wz*Cyjdi??yqVWkN`J#k?zp#?IyY z4;cu3hrYw+D6XT$q!LsPZoUj#I4yo|#Ge6Z25Q_otd#tVO1{m0wZ}maOnVxcQ)oAu zSoUT>*Yy&EW)3QcZTtipr6+_S)yX(@QvfJuM-Ggc=?iiuC%+frT*^d2Xv@kltQSjD z0^q>qik>5dK}=g`=9kz5l<8&yKb;9odj_?ch_vWd(0q5fr+L0A6!I`7jah~jRe zL8g$_YWmaEBM7n2$6-Tg^kKdtu8v^tMsqRU*JO9*L@d3u+WP&DD;Spq3wGbS-9 zpd752Jjh>|)v4oBeGardACjbEIG96 zFYcvfK*V8?{uluz_UGgFDcCj1 zT`ry8$_Su5&8>4t46L(-*kt0IbV3~WvZXQG>E6_zfh}`4_QC#jRhJhzv4nbHGb%4n zrUbb9uDlYqHG29=%#IMCp^bSVVn~HT5t|ZXi%RKK$%CeN%C(l*Zk1u z+S>%d=1l2$O@VIGc=r)B1FNI(d60#TLM9l4DEMJD7)UeKff4n8$1Oh$(@#ecgA1{) zZiCEVupv9AHD(Q`@y)4GO-FBY5L)y48iW4CQ5?LBjVhxd*M6m>6LQpo5hMAxw*Dr= zKpWcCw)bARfn4eC_&|!M+t8qMrz~f_lRDq@UFhe0F~R?3xB;o&t$1Gg52K7xyaUt* zSi5CAtia~CJ)|jQKnpWZkIrj!ySCv#r3rl=sHpSuS`s{gE?PT~DEaR~hH$R5P4B%I z%csz0Nu|NkQA=v$IW!fv2|0m85*B9R6E`aT7c-DKt@q;|(jSF}tRL}|wG9TI?E^g2 zgoB#sj1bDbIEd4=IS7Vd{~Q>W`Qkof0kE#R2(0R>12{Yv&-niP(5}CM=55nUny04F zl(?X#>EP4yF69m$(%dGH*8gJc!vU?#Y8?ET)!5ra$sbo;;+~E2q(GY>m$0m9OA16K z8cCc;0wKU7hhS>*0aad^#-H*6M6>F-Cpd}*9nWR_hlpYgMOC!{mW|ajR!G4#t4%#! 
zl9^tM2hr+uY>1I=0#^B3>S+bQb--oZIQ>6!^k7>Kf#gpjEz)DMHUNf$hs&dRDcJSU z8`1sX>eNedK_Lg&4qRk`D^Qxh0iKW+{8-#fn}IPAhq0tq@*jgGGhXKIZ2&qQ()-RS zG$m-Vo}u41Mi3=%u9v|l2OwNOR)>yYzAvK!5O1q| z(@X&_hA#`;fL73lJ903!yF%?wAQ3T6;1zYQd2Q*>YB-F$;|G;x=Vi?rEz??45C!U8 zwhYn@m19gYQBHuxUMyX|6G9|vydYiF={ZZOPr{4ZGeLh1qv{1rXOIH&NOXSM^ArJ?$YGXtK`07 z)dnLo&NM#GDWIB`l?VAV$FIZ{p&S6;%~*ayyDAo9hul%|@W8B&iaXrNe+@M>k#0j$ zz#!f@gLVONZ!B}vFm7Nv$Ej1OR|dyk7_?PA8B3WN*j2(AC0+SW{vtTtk6GG~aN5{= zA8I*R-{K6?Ojos%&1-mEHMf~D2Ce1`ZHc{%8$<`QXlm8xU2kL*26ubX+eR@*>#l}u#<}|OS zl5$8}l|zFtam?Ms6wa`$dG`$3ythEfUsZYHhEuzS8`{xGYq%YbZy-W2bEWtkKsihS zbzFV|LZrRN$W=hQ63+8Afg>2}$wQd@k06&ddO^CDIS6ML8X?Z-D-szPc07S3b^sVW ztUf=1whA$1)@W6$6p#zS%S+niuOr_{9WQJ{+dxOv>{1{NSMR)dV>KhddD(u!&N&zX zI9e-n0uW$Xv-YA+ct;#_m^qGjFk$#VE8(`7gtU#gx~?}sloxJ;HZ*1E@wgPyJ#v{Ng5=2vuwwm( zv?R{K&!2lPB~w9{59m(_c9k$mh@qw+#Fo}Uvl$3`obay`{I9A^Orjwhti}-w%@p9J zj&9+)FJ^=x;PD0cn~c+#kmf0WzlOhp5DMzXOUpB-T*7$C-`2IKI%or}lTHtzG`JF; z!9k-;9U^az&Wq%A5sPuE&qc7H$Foiq7si-nw5HR{vpV_tcqsaRm$j!}5ACMJHWti* zrN#XcWJUnWo|dwZ9?o^bJhYk+WLZajyHpB7T;5WjIl-*X(_1T-vAol)5_R6K1!>iJ zdq%>5X;v$`hS`C}dY)>e0IILI77O_9$q3TyPsjFM4dVjV7-;Rv zHaMvRwAiGJ@74iZ>oH|4E|;L&5K}`fSadaH2DC}olkdf5^Z`L-_2ISBge3W_&;+kE zLtzVmCYqQ?VUWcQPiV&}2l1Ut4PL_E^OJ6!a5d)AKn%5}eZ^57#r11y$uQ0V{POdY zu_0r;5B>vvdX_g|Di_2qfok#b0$c%bi`T}F-(Z{)W}SL#r8kTa3x1~_=R5$Sl@3y3 zq~x;$?fqQb)M^n1yI01efglybuNC+ z<{6eToOi7fENlMM2DrS7H1Lk2c$kGzCk0sg4PB+7QXR{~Fw|QHu=E^%p;m!n9-j=S z!}&o28XIt!NHYuo+%62_8SY(>EvY#rx;g1!MSi;}023_mDVagSO0UjMX(9(e3fcDP zmOQg6WcuTp@wLE~i}?lm8FJj|<2sn%0b(%r@0io30JX{?2J6<@wlqV}fbb}t7yatQe%fOM#@c$H zAvbx|E2uk@_tWYpkRCaBcg4+Nmmo95x_q?@uFXN3|=^5wKlU(m&#HZ=h}vNkZFG449$x?E%hG?J{wvx!pxvjaxtML=ox4f!B|NXk|cgKwvfV-km!bzYr_rmDLHctqD> zQ?DK~!qhv?(K|+U{zb9M>-<~&xUn#H+P%jx*#z1q+=ye@yI=(AhYz>_*kpH6&eTyH zT}Fl(fO^qGnWd-Zz#$6{)tCB`&#!bmEAr2EoV^q7DG*V}Y7F+0thG21FNx zovMf2Gw0$d8|*Z4%7MRxIf!~#r!S73$`1id0Jijs&eQA~4oV+;0fh73yXgwkI%T}- zJ_EYx=@g`HxEweEUYhv!CUegFK6*e4T2;?EMyk6Fit1E#bpS4>a@M4Tc$VwK6{Q8c 
z1iMY@7#w$z1iGhaA1JhT>FZX>xgl}sbss(LZuIcnZ8;E?&1N2GlIN{!7^q>?PS@G$S zs|zT{!Ppwg9O(P zf-cx%?mb0?fw1A47M`OzW%(NfHTF%zF_?ncDey$jQ1AHblePt2^xnb;jv?(DFIa_S9!fM`4Q)M2y*V+jN|)?m)wWyY|u&po(3K1AW~R zdG?zb!EceN`pDUKHngtE0X8F#W6=%9V9cnOv?nGXrd97kB0*eaD@@PJ;WLGHDqDj~;0Wi=ssxY~579An2VpyOeg9B%x*m^aER#dnHTK3iemX>)tZ`N?n(1L;K^#E&KB1c!w}&y(`|dbMljNI&`7UrMw~J zxVR2KGQAWFN7HeFatuvf^&i|R2i}Pzw{%Wn*YqHsS_Y*^36mrwZP@)Au%&XsoijdBBT02`EAObK{tne+SJ*EV>97a#rbip=> zjw8hSJR`{U&-bDH>8BB4NZHqKKjqM7@rb@oV;~mn#@^<{=@_=CM`+vr#VQfy5T4uERL@$_r*#i8BH_yD&c%N8B4`Lw~#7RMbsK$nZ}Dwh8# zA;=vs(srRKR4*Y>od(QHX(c1)i}M>)6Bf-0L3W;I)VyhIr0FO6Pe3rO4*2dmGm%=W zYY(Jw@&(^~n1U>`_VuY)XD9u-oH7SByfgMuza)gjGsfW50$3jnaoXl;6Jmuqct`a* zH3tWgIRzjyu6uo>x**tcV?WDasYNy<5e6#1d%+yvdu=k-PHnHVV)0( z_Sz=I3gfRpVMDG*$B%)fKsPa+Tbo(5M#XQY5#^9Ba!H#8W1!0z zsmIolaaL>>XmgZD8w&!;+6;Hy6bz4tG#Q>JQ|{;lSV?b~qW5zBEz|;zJ2-_6s`x z-3F40y#yL!rv&j}ZCWQhrRwbhOU`k&A#*g3%SP&vvqFU=snLc$qHwaG8y&`ay2tFgPD zydX-s=PaER(#Bn3WgWKw+60wVtWMhjQqtB?CwWqU)YBN>HNB`WS_bWAsGocp8R`8p zRi_dD`WTHs6o;>os0qM>^%mRNS`4D1lT(G#zc4D_kDiujV_OS`8ORn`9js|E0wdgx zykg0h2OGR%uxSC%s8>EC*9MX=J^#^%heGX-j!Ir%ENr zb;pyM>H#?clpIePjh((6QR0>62|>2fDUC-?9Ghc>=}S7#GKIEU`#9B56O6@}wmhl~Jo(3#2+ z$igs6lRS6Uk-o-bXbBb<=q3Q0hljcS3iht!dmJ;TBAs!l{r4%fd2%#Djbi4M_F5hQ zEc>0hzLR`qRViD16t!+|5rU#Ut=`(E@i(9)w2QsyN19UtsJ(ewakq}+TC%3W83Ah6 z;Y}GC^e#!59AK4)ab$Q57CSxiK7qED@%E1_vg^FJ@rRZGx1g=^z1(S)8zE*Tt^4N( z(007*7+Sd-o_tlMdQUy91+*kv9E(9&aM|Ac7WVd}zNe{?j#yxsF36&?NyMww z(wQJE8%I_tegwn>K0KHM&~jOK*~gBcAX>l#fDtcJhr5v0ZfE3gZ&8QP(PDxytwY$} z@vUz{V8anbR5$=wrRn23mXLgPlzyD*Ye727Xa3uclFPRPX~nnf8ciK{M#B)K$B>u< zMwMlRxe3MTk`d5ya}&*{vI)U(+>+3Vr?Jt_xwyDB0bz-Oj?C%O`(w$ zKY`~bGskVjLJZo%-s=YzlCP<1Vr#&*lP1J1g5ZB?Jx;B)L0HIdlyv|$!+KA@^pZO1 zqK}bvL6>XQpHhO%Plws)?IQvxj>Q+vAP3kK3tI1tfz;+*$EjGzut8kZ-vycN$(Xkm zj>RjY{gNq2O(hP9%^bxDt!wuOATu}>yTo4`r4so<`a=t3aXE$Ow^rK(VP1PGow$62 zFa>#Am@ltZI5Ajt38j2oqH&fq10o5(2tkt>0W9>r{zX$TD#3h#@4?tfy^AyfSj>2$ zZr3o5JK}gLfSBbBRWpT*?oJ6aXqe&C!E^@zx1wX`eGZu$ZDmNWJet}f#C3tIMHSs9 
zz!GwVgx1;tTZ~1WC`o}ZyGpCrE_7M7Iw2TiF+pa@G%0SJjR-+QHp4Um=b(yRyni<) zKy6nS{U*>7p0g%2WJlnxLfJYOFYzjJN`QrwhZnVLyf)Bf?*lMGhp`@~&%UsdN*mPj zskaD$6*~`XyR-qwzzPF(9kDfx(^Y_yO|+w~6NXbNgGR4%gdaI2b0ELvmOJGm`W2Qj zblI6%R%pPU5MrCI>Xn>bpzWwF!*ECxPJt^uo(r1-k&yj+bTz%D7orYKU)0YZCs$O3 z#ucm%=QMjWeksZ-$_xVSHpLs;Fb{(iX@Rh_v&%fHH%&)is|*Nw67lKsbI8{zW$Z zRy4U5$ipeK(ef#_Hem?rCnP!m%8r)`Q`4KWO4o_lw@01~;7VfX3@6}-kaArfZx_W6 z$~gPzJPM7WX#f~B_GcXv_uF=%Wu`58JPIeHhd=#c^mNqDiq@MmfWG!e(Ge0EdvfC@ zSGvbouog%y^;@H*8mzL-NF@T49VRKHrC%;@Kz9jo&9UT1J*Qn*q!LY@IZG8f(OP*X zW(>5U`7p-jY#qQ8FQr6Bq%Q9c!+cg_Hd?M-&q|?Bmy6Rk!hb_yO)DA`MN6c z%U?;41fk6X*rg3=Ido{0n(EX6TYYYOp|2BPq6>5(u@CJ^Y67e^ivv^zTHIc|RF#9v zmgG08$IiSIzQswMIj%{31%(k4vZ^dQ?yKg~N1K-0O0W9#| zQ~2ai))`OwTG<$cMK7WC1kwWNtUOKcgQ!cGEwzCwhsv!nb8E~o1yet`NMi9h17ZxW ztWm4bHbDHm9AKM6`&0;X6O5N`+CcN#kKdo^fD!EsjWva&tnz1Ky3VHyI$GIf zK#YVcvZI4M(>meaFHyBd zI3=~nI&ivSj7L|0X4{vWvqcC>oIFNfg`~i6(srP&Yx({O7#+BWK>c;=NCp=ynblZt zC!!0NL+|6=+XN5>0-Y{mYzR_4Gc zp6$6=9d%5a9suDs;LMLbCCN)GZp*O?(gLG)c}cttvT*usDcV4X0C%j|yN%n3sf*BA znmB>mtut|jP0cSOfHL*dBg`&FAf(%S4-GDN44=vA&!9QTYOID+w1%Yb?SWJ6sXoQVtl0M5dw7fwMo(uJ580_l{mx`MiOy7H23@-pkKCe(r^PaK?LBy0og7G&39$0?|Y zEscdKkW#PZR8F^!iC-Pefau4XkD7F<`Qs>8r{ZT^QLl!_)Bq;HQp4~W?}Am8;SMMO zlsJEO0p;p^oEV_-I%@Aj=zRc<6?V!W1iCf^+~SRn79@pnq4HW(1K2tb%R|s{oQIFp zar;~d(AIMo2;XHLQqG_);7SY#BV+1=rl)pV$FIk8RO1@6RHo%jz?8N3Zg}pR9>|kk zKwF$R^DzY?>e+Bn%>ZRBuy~C}whjow>~}3PB)z=K>B&P4dn-(F+CYp^{^(=}9#7CC~ZFzSBq+fmgqad~45jl8^xfDI;fvO^r~ z+;tQO#fd5a=EnZG-ZGjx?TPYHDW_eEYrO}K!n7(h=>!L^+9P8NU~8R@Lv28-rB{Pu zA=x3wj99p6j7TqMk>{yf$B+EvTtN3mDbChxv$Ur*z_U& zs`aGI`2IL^0zgN87a}$z3PU6t=>X}phO3$m&wvsEZ_$$W!w!sY739s9Py=(wiS$sK49qw}S6D|N<`@aWc&d~M=|wTuwAg01a%$9Y76%bt%P&CZ>B ziQHpI+X0D|kdiOZgfLg*8J*tVt#j)^2CCJ$?>ULOkm@G#KFsQ5_eTFcaI(1La6_p1 zQrt`}NXq8-sKO7@22v;YhljBPpfolKXt<8>(SxodVkaeY#DLewsYOTh)-W*6!9iXehORE&@rh5)?x*$a0gfDzE03w`Y^)rGrKW4V5 zS!cRuk?J!lf9z%gq6pp;#x}ckzHkN?HRKEwIs*MI_d_B69nBe`mgtVNv{n%HF^rSZ zH%8HXo{*shAQ@La)$7|J8=l2Zbczlk7CF{RDbPl|>`o!b&5jqUv?!9!&ZU{Yj#ZC< 
zs*+d&dC!sG|%(F+8)N zwaCbbE6{Pk599_B`v7W0V}dA$o=bf?G23C<0^9{5=Y9AL3rHB~+ejq>Y@D-d&@;z5 z&jG-;(U8dClpKkieww&t&|libjBNmnkB3(m8ky-4TK}gY7{n7AsL(Z}(edkf=m$is zV|Sn&q}5^nkQF_#IYDkk`^rOKn-c(xxS6RRdfRmrAEsYMM}S3N@-qoj z0>tPHIzn3Ic`f4(fUc?27$CVN5XYf(?63t2Q)w2^HW-Gxrvsoy>r+)J7>2QhAHBEh zfXmOaWdtC>DM}haVy#EsNNwcMuwbo;Mq_~c8Ea+y1u-q05a`BpG@h#1b?%`aJuINn zWwhmydGa5S_4q_es7)D{;=0bw6i9~6KxpQ7=0^Y|!{71_|6*!Rfa_NGNwq8=6M~*`OeyB` z&X3=Ormzb@6utdcK$>Y>8Js#sXSHXL*fowOA3)m_xBFK!ldrE-+oqehK$NF<8QPHK zy5g0$qGp{)FVm+W%eY?poVt$R_S2{t2uYYD#@U+@5Cy2p@({H#r$hZ}+t|rz`>R?u zOn~Yqqp;R42q^~l(+mn|EXO61N9mye3&HuHaPiLwfzOBr5)P^ZFuI&kjow}UlYC%th&H!%rD}xoRa^TFDPqHPq zx#INjam)0ftFJ*xXyhq>ldc1Db6L}Xo&sGeAC)Ii6s(4T@v=JQRyaL@HhfGBuwL#2 z=SpwO7zm$odN=|3W9+M8`teseg`=&xgY?BLLfA zjs3TyG{}taj0gX7uq)zWO<%`A3fiH$PfU&o-GzjA?AR2}z)#D?V+~WFdf%Eh2WF0g z)xGjL9XK1y5fnTc>HO=xEr7Ui>Y;7&bqqcoNQ=U|k~B}}7{m#puEG9W!}AWXaX2s< z4Q)h-wen5q+{yieCSw5H?vd{Y(uRwtWznq8cp7gAwFpMjOQNUH7;b48TVpd2ih449 zVBlj7$+4CQVn`M?2jVIwAH$?-6Ue+r=muuRh*Q3bWZ$P`ancbxE8?Bk>WRDXnVuj}xWJmpHS z1h_`^j7WSo2%(5m`Z??z+QoaLX$-bKIV*#vK#f)5x!ncWW^oS8KMGM0=;D`qYw>{d zln_a%o!1#OLY+d61IQRhVhBzaYAg8n#aaZoD}Cy_&=O>^w&OjBP93YA&D4nTLpK|` zAoy*@Y~A$z@#TOnEf;Tm=iq9lsk76nRDuvsKY=oVbhU7HOP_*W0xZx1-LC*isdWbF zDKtjmK7oc;N4()pFGEGsJ9334Co1m#<(L*&ZTyLNlM-NN@>EiMhX6AhvJ^_4cK>av z@Gbxu<<-a_5#of_T_b2~hfjr?%E4tvh&c`rcD-|P7@aSi}(1+jk8sZq4Oy=n#e$x@y9Z4shci46s4mv)wAR0m{9FKJql z0XQKT>Bc5s z;mj0-5L199Jj(DmtCR8@qSWyLm@0UHW|G_yEE$XFJS{-=CGmp_Z9=SD+;Qp{?rYEg z2~-7ICXA#F?k)i+z|W~6UZfqHpM#$QY%888^{vtwL1y}j+t3k&o$o$#8vX7J`n7rqtp!c0@<2lyV@=Qnd!4QK>Z*H&8KG_hTlxf!fVN(Ft1KtLRiIyp z9fK-wbiaK4U_uxo_(yJc0j@;-M%+*npn@Pb)(tMC%lR@TgcfkuJT2LommkoGasaR_ zeC<*FRMhb){VHM0bo{bm8`{bCeo0ybWrFkSe<{Gs2HXC{?rt5@t=#XRp8#XmxL8%{~T|l^Ew9CpBgTNc-Id!`dfl3V_ABLtTIBIF?MvI5W7�@n8 z@F;JXC#NIZ{@Z9|A3;$dMY7Av@}O;Sih&S40W^PBSEvbXHT6`VjMXIs?v+n81G*l| zUlK)HUjHg{sR%x4Svn)qF4#7CJvJ#oRNkROs8~nnSE6Si$)}5v z2hg@^%%qaLA_~_l3PO|rY!~$${PdH|`Z~}srHOccn1U&RPn8?5Arha~hD!!Sq_f_& 
z8G+F4Of>r(;1;-1T>u(8^Zuh8rs1If1Xvo6*ID#a7`uc}+~8b1fnR`7WS|{_DKyev z&E?CP6Y*=FTDTm5;K|cd%w&=?mBsWbFgn)QSO3~I_PZ{f8Z&^YtAgEhR`uEM0<8iq z%ddiE1i3!>0BDDDM1U(r&(!1~#5|2r$B-@y&l}1DQRY&7NuBBy%nS-hXP^EYT7s%x zTi$b?IRSTiG=?64Te# zXIzpSDFdR(LwvGFkZy+@m!+0+FlEN$e`Dw31~RWtp(Riih-Z?FyN;x>+#O%i8s4Fi zcnYK#HomD-XtP+|`w$gz0Cv+jAJ?K#_0?61t@ss{7MR>P%uo^LF`};X#hu#B^xk8d4nn(POV3|UfJNe}9~C$TYe0CHz6O&zl2>5mdv_#fO0Z7> zz;t>wYzk>vd=VDt`nAhLJ(M78io+y+LNU3!O60v66151jET{Y~CfJ#Hgu<_U5aM=p zKeDDEBsfXdE~IChd~B-i)r>%l7faa@w98)Kdp8+!#|^jWPh)7x(XYf#>SV{`*N=C> zn1dUO7_R_WP5g{#4YZn!ihf3GMi3?G$=m}-Gvst8sguRPKH*3GwZJ63q>6PE4QLCG z4pj?w!G7QZaZ`ejgSUu@?*huv_94G$NdSuOy^m@cL0iZTt!{FlY{ZV}7}64*h^?^+ z*yiOpTRf+`O9)(inW6wt5c7nO>Dk_?Awqh47wDR}Q_6yD{_6cSWKPaj<>OF^gJms3 zEP;04P#kPU&kxr9{p{39_P9qZ6x&roP#Ixb01p=}0MIvT2G1h^Lam!E4nAkgKlXbq>9 z{F;j4YjoVTRl`M1>Do06V|IY3#73T4u>sx-RVDyS4DC?Qj0P*w5Jw>E#O@*b0^LqH z!sD%jF+mnfUs3C>pkyS7j_ABTe$UQ z)VEQ20BPjR93w=1=+YYce2<-wdzEUtlR7fiy1O;xmk>}Ihzv2}PV1=ST1UrWT zK$q5Jkt}N_-&o1E|2FCvgdusl9*%7QWu93)%TNXFZoqpA^t!apw#VwC3v#{nbqiJ> z%Zwm3@%W0540w!#zK{c46eoD~B?8+ziwmd;!Up=y&G7sJ>D@<3S2*8knp;ncH^vW~XsGjQ$v0JEp6lu1jux0?57R2!kY;_aevP36q&S|B z((&OGgdj_*o30bG;Tf-(WKPIzR-@JkrXjexq>Iv9g zAa%vVZXZN5fbDrYJTFr)f?k5f4ASkB`C9#hWF~0(HQ8KpkHjB zfGC+k3587$b(pBAQ@!NnG}{_Ij0tBT+kzcVBsu_N3*7-pUT<@1#BM=jKZK&f+5pn% zw|hE}2rzgseoQF^Q5ffb%GC|nrlH}+{r`**D?IPx8lyTuug9xS2|$UfuDJ^JpxtcZ zmiPn+-`m*F+J#hu-`0Ov@)2O%6cF*y9ruce|4NDjD^vf=X)< zf&^>&p}DqWk*w;BojSc9@zwk^CB#z3k9~EWeM6V(Gia;3ycQ28j|i~2dC`Hqd7U>- zpp1d8Pd%_V0l8QDx%L>32rCEBCwQjidr;namRT^+BBmgO|Fs{cvCN>&o#m$b%Yh?@ zS{o*Bs3a-hpsFz&Ei|BZjhXq8zEfjImsOJ!Yim;NNRc~Hp6m&Kg0Ki{}$oKPP#z9nwWgWBk1W|*pNO}0LuO& zqf8yg-m%7GfX!F_qWJ*w`;Xv95p_tisyYw^(4=m)Ys`;Yp^Q9Fd#j%<95Wo9~FdO3img#Ow9<0ng%+?%ngXn|p@vAhjw#wRp(c4~N9U#1kO zy78_m069uWe+IHGw&RH32yAQdLwvk1hI)j!De6X0+4jdSLn~Z=10A}F5Ewo(@?XJc#-h!k6h971IZGa2#A)ii- zPlV?@1-qp%?bgW9CBP~z^?ipXBSZqNXstFf*ng{Bfq9)a?#Zj&5`wvOGKs_!P}y`? 
z2-AMS|A(%3i*h`>&in9)#bqUM)?ya2;s8hE5Ls3Fi zS5>3>QU%pz8r}3L;7}4FOBNy9p-Q3@Vr0#)qR$RSR1g^J<12_}*kI?LF$ zdslZ+x(>IpxHZuSTU0AvO&CCMM?<~Z-Hbi(kt4W%mxLy`&3%}|(;hMuM9B^=nVT7` z$2045gXy2?kVtMTva)!Aq5yOaU1hjZ0%byt70wEnoZY@hCUJUAALCUFI(&n^o<6Tr zqMAmmD5yDWgJ6iG3h$9&111OKAm0FS zru({eyAL}^xM}xemFu=Yc(A46Ko{T!Anc3u6QsmbY#o%{7`_`NziaBxCqR;FX(}+q zO`dkxE|)q%Dq(f9hf_5so(j%AT%@%sITe5l9r1E-w!U40N>oQa&sPBv)nn4D+IjyL zIm2nD-_i0`6G(BCRb-z#Y(ub!PkulaQ|L_%Q2P!|b;usm86T`IQYzGNlgyRl+a z1!q}!pA7p;KR1Qj9N2m;3Weg@$L0I1&+n%1b$+66w5Y8hr#<~Wtz3=3uFmuwy0|uh zVEJ+@iaY}y|Mghij4?5^S3dFYBgOx}|0m0jcmGW$4j)n>s@%>Z<~Q%(Mj{Euo0KJ> zvnIn14-r|#0cd&RZgmxC&ey`)Zyl_1NAV$WGx>7#gmTyhQ}{j|=heFa=jl*->;uu7 z9-15Aq#*Cgpx-Ek^2<*^VmSFoUHBAWF|9J^r7q6=;k6hw*?qJDJ@;r)4AN701xQw> zZbp6JI*lXo-8DS z2;(kK0gCGU4Kj(-nP$|hY=WgR?Z#CdtWMnZ6zrC-&L+ISr{~SuV0fnxW*5l2{`PM% z*zM!YG_p8h=+NrYvFL-CMi4Aqj-`{+mrw;|nu23QO+(}v1joooJ*Gi(D9X`Qboz9m zwmT@M0CN6b#6js2n3OPB7IuCshH&B>11Tz;G9mkQ`%zTl!b;PA+PWjZ0GEwrLUuvc z9Qz6yFZCh7P8+`gP7=Hm+vemHND3^VQFkT)gnjZp;ih(f{2oe};dYA^7Giv@3!3xH zL@MmTB8 z5=EyAa%ab8M_LQ$ra1IKf=V<56+u;>0d7nFU|okgM+tAo0jY_xPWr_uyD1p{R^!|q zGDyL(ONvb92_M9)FSobQz&7Qt03zXaUF#^}w)8Y|_TBFaG)W>hf~x>?_IWGXAIF+& z6E~VWQu~B6(<&JvM_mY3!S(D0Nk0YAAvKLS65*LvSjcty=An%7Yv2JztnO+*4>v+y zXJ(D`X6a*=k+m{7hX6Cr$yBbrn2xXtM~lfm4Y`J>sAN*Bgqx_gy+}|2nRhPfXf?A6 z!P>v06AtwR!zO4GNJKl25p+9!x=JffDqRT3j_w-YzC%9NbZdZ{I2ZTGMurr>tE!|E zhkOz>7`NG3-=uPp)fBd-uycov3rx9RStxcDffhh;hLwA=h}*H;x81u+3OXH<#pK=g zOMZ0-&NKFDDbWeu6!+~E{5i7#)#60R3Vj~}X=ALNyy8Z@bnBs25G815am954cJ*LA zfr&1vN9NN+)eXkQZPyu8V#NR)`NoZCRhm%oek8#`W{pn^AeUnAO{%>pL9+m|M>#4G zh-+OdOI6&&=&?T4l*%!l7fXAAMcY%(X1B&X$`w%F44`FhElSCd# zQ6tjdSTrZM0M{y3!RUx*_N3cYMgiuh6;t|y1wWh@kIt_Eg+w?JKsIp` zEvL}|6$4hQ&_XqR$E-32o^>b|lc(vi_2Ex=;)ayl2CXf1$6WyI)1XR=eULfkXfU(j zLBdaIW67*PW$#k6d4MIvWHYo20gf@tLSAtrt@?Fxiqq=`h+%PWBX=0-(sirzD1i8K z`4EmuxE<}m-bUmX0-2|$R;>kCY&9*0*MVl3d)|E91QQW8lYFM22RhN-wO8!W_HirM zh>RX_0D{+*6*7o3D;AjRlSn51EcOhhxSjY-nXaAryJgay<6H5>*EB3!cb1_P`LPPo$hRxt84bP9nS?8rQ5-R^x2T${MbD3kUd=_#x>&~l*j 
zVQAk4!`fC%uloKjt_TfMKyFv4)PgOrma63h=nBHbA=}fPrXXG?2NJc;!;?<{?5)Qd z`LhrXou68t0>~V_6Q|}%U_#!ydy`x`B<82pN>o!w59jOlWw}VVbebSfGi6~rj{CMD z5mij8y13znOIyga4{-I+fM2E81mFIN*&hVQ#zU9dwIs`s%;@IH2YfS4k?gW)fQV)g zT<#gogt3n~6c{+ta!#(Z_?%Q&Z!UoGd-q*d<4d??vI zZJGkyi6x&|!YqtvegGy1 z2ir|r&98zM~!C{%Swmf_h~&%%^))1PEd*td@w4e~)}Lt9-3Gu18v#;C*C-oC_x;g* z#XYclU27&I+|E6R+{qabhIYeH))YX(`|A&VTxAB%bY`Qp^OqlM@gw(cq|e2TI7MJY znbW165(uUr>IsjE-S57NSgW|R7L6uO)u9f>spv=p(r!W^Tbw{r9c-r$I5F;Hm{1Va zbAtBme5wfzPDdkwQldr>oYM_WBqqSDyhfWiO*peAl!7lG(TmN2j*c}Jm6P1BMuv8D z8M}bnQ8TS4uL<7K5=#Z>4w#|f?tz9GiW& z%cdRdCd^CjDz@OoqAtB&R*MEukuD=yH3V_n6N*_~(-Q6kh`tf6S|)(g@NLGKJvrD< zL75wPyEFHnEGiw>yM5fMkVqImJbJ3BTM3fW;=nj^09u4RT`5ghA-IQUEcygP2n4D_ zBg=PI$-SLnjT|SX=SVw?UEHql>RIEsW8dC~YIg>>Ri$ENGqT5luK!GMt9Go%rH?6y zbe=JAnAwAZDnm|lkcGg}XvWsL?jjShYM2MAuKRVMLh`(-rHLZYg?asnZ2^>^IJSBm z8Y_b(Z;aj}5P%D|nzbd6rS6 zC*myqF$5%edl}jLGd=30wY(wF;$RUu_UlIu`;-DZ#)%}@!lrOd0A`5!vQ|4!A;3_2 zOm%9Ej(8-456;1k8hu8$I)g%d^wdOtk%Hf64GE>D;If;d2reBQLsd+Xa$k-ty!x3> z=JGg+h697&rCtN9J=+^9f)UF~W(1*>yg9r^Wh&d7!$UC;t^kSZ?02-YwPe5=lnTDV zP8fBl`KvgSf?pkIQ7rpmo`wUnLH1=73qaWKGE{GwuFExO6ktwyP?jdPV^8k(P^Y@d z)8qY~JpojwK7c}qWN%U=k3lYkZ|F+VAboi^+Vr8lxlvCIr%45pLmtefwvP>V4pd<$ z>BE&64V;3rURtkmsv)1IAZ#?#U!8-=S_|h4CtRHAw%bQ&6PQ)2O)s9$)3@94q|2qh z?a=w>-V2^izOOX$0OHhxV+GTF7Pkm+%OkstF7qV^z)CpsECZ-5>w3RFO-tLFcel_mW*HVbRSdXfMDPnJ4N5A&kdyrW?ZKj|a zlA&at*rHyv(6y->`T>=dBg>|?#$OAR^p$449L z0C8vYX6C4VYsadycczMU4Pq|rx1(*vxj#ep@s8~!kWzo9$Ctv2V(!;o62hie9)cs|#HUuE zz%CXslP?<5Htk9(B_O)ct*|mq%X*UzuL2^P^f1_xF?>w_p(;letRi|q*cxt2uQ@Cc4KLoN{m}#JY>9udO_yxASP5mL`Zjq2s;K>pWi~b-4yVl3`+HYaIFK! zeVi_CvcVul9H+4BC-g`IAdIf6Qyb!To56w8*ryzMHw1tFz83v6@W7o)(4h=ju+9y z@!zULvynoH&T%06YiQK#2ExLS6jPt8@2k=*f^xJ z8j#^0Qg=%^rW`}w7d?vPeFQ1=~KPvQw3#0kV(fc>&So2F_6`;@>vHW zlY4bEij#zVQ>_VrKaBEme*%%Tz!rX(M2q&=;d$f(E;|5o!hlJq!@7x}4N+L@K|tsC z)m-**6T1u#M2$NJlVjE=G>0A-$q)mzaR_qJ!wSuBO@D-Lp}4f)R#Q9%dN@hNLHQn` zOrVmRv!06kDS)(}i|uLp`qlEMP%tp7XS}YcZeW%uJWw0vRW6`dS!;0(ah@=Dj}Xo! 
z7>45ZIa$y37P5GtonDj;!QblyADu23cGmT%L=iVx*LnqMm+XU8a&(QG9IV8xnu>!- zCZw=rgc5)5bEw%LJ<-}o&F%y3-_~&xf!`Bv0C+c#PhwBgR(d3lCLWe1Pye9k1fv4B zfv}3b26T*@_%Of&=mMBq9v#>$AK=0BkVR zi@lK%&?37XFXxQGSp&qm3JHCZC{(Wf9cC)0P~eZwno=P z%jABL{=(nwKi9g?9N>gh#l+?%6ti+8UdPM*<|60pLur{X4}ti(cMzRGA^Gq6CU?<< zvtQS(IgO4=5L}sL_$w~DmK_J`xgWhT5V7yKC?QL@vkDMTMWAp8-oaGA8onCP(Pb3? z)kDe}hE}X`HWK0Kdf4D;LU69Au&Am9CPi%Q(puwogc}^ZX3HAgLn4n%muwCMnAI9q zF4Y_L{C!+9?K?1;;UW-<2Rf-Y!eI0u3#ubsL)_V1aIc+S0UJ5QmQF^Dahn-EO%)F^ zO(3CL4K7b{nvr`Ok78Igg<@@EMiAu#-I!jZnVQ*?dpR+Sz5trDdvSqfp0I5SUV;%y zyfl{k`w=e&Mx*P=8F}O-L>WJS(9c@m3;Wwo9iA z+7OVX%>Q(7BDuvw0bL+jT33tJ!|jA_P$T*vi%pk#P_TVDwi+9AAgB41!>_bSLkQN$ zrWV>p>1%G?BA;;zk6{5_ww^$6K6pgbwkS^_EmmV+zArL`>WJ^CqMid`ORL;*sB&hU zte0MZRm7E{IqtvliBF=w|L@-iJ{|ERUE<08R%no2RX`qKQ7ltB3n1cr<+HD#cijGJ zU6L&UEhR3ttM3L=4)iPqx(7IqGKEKrMm_iEj3i>TQU#LEvsNs))j+7}{xV9p2g~Lv zA`OsJxqqaYXfxr1JxvE%U^B&nDe4Bo@8P;eVJYadtu0#cZo=FSq}S}FPxmw(>nHES z5X3QcmmRSwb`B& zxW9FU`V4|0p4>vExUIxH@p?le`|!`bUf1-8+6Y%mZBL?k5sftv>2O*=iyi>8e}9i@ zC{pOA(D$Uo9*|VyecHa=k2Lc>#%&Hbq6Z#;ozE64a(p<1@YiBl^Ru6&4ueP(BZr7! zY_C;6(K7*A3%qcEIunddgDKGEz8&k#l+bg3PJX7}L8S?8Aeajf>99^lIhZG}U*_45 zOOQF%QA?Ei4;CD+zY{k2@(>81rR12O6u=U=4%-xQlbhRY*{OxL53L6J#7&CWhbLIY zfQ4Q4!vKqfr4tR-su0WpJIhpp8kn53ITUSq9Y`8CnbFhWG(gtOc4msV3B{uDBcM{P zKS304kzgBSN#4433*Fjtj|B%h{Vpg|SeUZvfyv!{-Tmw1raV7KH^(^14t0mI{{Y|! zujvB8(0*Qx7$v_oovs-J&D`c<(Pje4`KL`%gLVqRJ<`k3DV+J=m)U^2rIAC#Lyex5|?_SusCmPY!Dc8F%Q z2`1ZTY?Ny$v6Z}a!VRs0T@W5HTvT1_K!N!>p534N_IuP;AnfgU$k9(NO{#?x`#v6`YkOMle*2 zS_-ha^?>45PxxURU~Yh&KFh)J)NvCE>^(own6{O?Imp)4nm`*0%y2#eZ96C`pf~FR zETA$2h+aZE9n}ZHR^59fv@gdHh~?;j3=eR-ejd;u>}hRa=7{yuwIxem6(QJb=b#n_2_0}K|-`+m+i8^kTGtMVbvjOHjtaz)S zH6d97`!vI9!dm{CP6$2YCS`IIbE1wR8u2a!&H7? 
z{1V#6Nfy$}@2|)xxZu)()-tpFx!;XIvK*DiiI8{nVoL${qI2P9 zjz)|{NY+`r3{wKa2A)*+vi&&L6eoZxK!l-acB>w20A@30%hfnQ?xOw9YZ3C@|7y$EI@KTxkX9*MKIi6(Sf8AZer83?)srl2r!66NEoab_zm+V z1Fl~nohra_GF%|p8d&A8%g)x<&?L|;=QW~ffXoxs(l6;WA*J3-?U(8*H27N6ZfQGx z!tn|^846(sl9S|oEVv7H8tUhJ3BAsd;yy@vj?JE90P${X!C`>g(&J>j4tx$FxXIx9 zbu?fjD2}&@*}*Z$yxSxl6=VX*#dKBs45ta+*HX${+XS~o#^9A(YpClK zk`uTYPeTEb0*q>Wfquc3qISrzLZy)wG?l-wu5#h;F?|-j!5oN`p6;YaG%ukba}Ic^ z(&hf~LQa?u(%Nkvg2nobT|@;bIEZY$+iG+X0x_uLg5eT~)K7-$kIMjONS!{fxlyel zA-$uMZ=zw`h~c$&Q7_IcBYH47mPKmze)A66#YwCjX1I=G0ghF7kfY=@At4tIs``XU z3v7Mg<93Z`)cBsR2oXegyU6CZ2a0 z={9g$bJ1;4bs>-wswZ=8*lB^0G3=MD+Vh2qo> zj?_*AohMo0LBU{`ej4BJdcZvMCp3+5ZSo?0Qe*D`1%t`k$@{cImv-;To>}heArQ8c zP*)z{WY?nV0?8!nZ9+vOohhY^l^k?2^DNtwEb7{D1w>-;yucD}E1KPD1|ea9uL{u_ z!YCPIz#7#xxS3gjok20Ds~o4N!a6(|(N@kJguu>N8nFH6 zO9)hnRanmb-a;hta*FC<9ui!g{7D@D29Po_=}JZsBwcArUczlP&{(TEFGDa}tJnP= zQUwa>EZ?9x#_4!|o;`?a3aYkSooEe;bI^`un>yI4q78l$H&7aoNbr7`Baz;l5L^Op zXuQ}0!nZe;+5m4nF5WZC6yCwZ3GwJD33tIxh@+c6(A$Hg;{V5JrH|XWWS>=K`8W~O ztL z#r`~ftK+@eg_IlnwT ztJ7`;2n4@Diy6&V5$r64ISdL#35rU+?ktyaTNr=UJ*I907xkG^LN5+WODTqDXh4)| zqSUkJqH9pxXhlPSVyNanMdNsE46Pf`T&!d`yqV(RCPlP7EhtD|YyR}p0E(k-mUU7{ zhW*iEb)k@pw$7RLa61>9T14#w&5Yg=x0kon~ zC1ba_zu($W%XkTJ3Njt2wh*RszaKJ)lcjmw4zRw#i3mR`QGnC-(w90)R|Q#sGAoQ7RvH9I7Rv3d^@m};JVl`s9JKTa zuVFy|?Ss+a&OWuUh1-nXz^4sTDLDB+s<)R8^CJ5$$T`}M`Cbo9BotQkj@vBQOfFit z@6H1{#zFGM_E#LE7()5vC;lzk%B|g>G7md~>?*|`p+qx=06S-2(aQi60CJ*l{!q2? 
z6lk%tK(4h04H>2mv#{QL4z#qG-FT6928k&BEZzlx%)f9WG6y;NE!!2jgy1ZkFlw9!n+4rz*|AZgLJGvZ75x>#3m%v*Xf>z=$qBJODjk>8H&lH(hzeNU!5!3( z)B3r89rc4O=sGiEjY@0&RMs`ZY+)Vj(zK%?PXm}Gg*|1$Hz5$U9^GrDpr6=ft z-P!E`N%%4Lb_v@BIUcN{X|U9D=!56nJ0YFEVH{8X9_shUD0o!`z@DI+(eP(sS-8=N zbY#e>lo(=-fyirLXEQJrr9G+%B5daWz2I^lk#Av#`K8Yblv(WK(oXT zo+GIuQ1wE6RtdMI#%x)Ys0_h5!O|5CZRO>++8A0&kfYFO8;A<(do=B68f3n82SANq z-Tpe(LR<|H2|rLn+{Ep44q~3(O5ZM?|7t@a&nMrK#i33LZX0;r6uh3<_wPY*PPF8% zDxW@?{uAlsIAK;P;B5f5RPL*M43l@At4}B$OnhpRHAJMUrvOj4#L^_;x8p$gX~LZ9 zA^vH?NCWAdgPq<(9Xp-bTQ?{V_0Y}*7*XzVdqE0Mfj_T0bP2R%S+!sz2EG4b1jj6i zrvG_R=Ks`o=n7CQ4tDLe(^G^XasEY|5=*|^4%UW68G?(=3h$ixUGxexRFt0DSpuWJ zIQ3k`?aaKb3R43T`ASS;>HsS67wJYCINdh9Aj?gVxs=rzeb;i(+qbCPIL+{z*`^i* zk_*GYUEIWT&DZ~XMlwN19DSpGTNpX^S7EJufZI$mhCss!?#3Fa;Tf;0XtF;Bl3QQb zpkhtHZaOuYqf?E}9HR5{cRq)PL1y^@6FQV~p1#TK_UgdQ3?j0jHzdS`BiMNn&A=RF z2012)mK~A}46>E`BkSvh>8n7eR%tbxdcSSt~B zb}=o4jv=_G@zD*W(}6@`(et-moaSr!?lszPpo-f&gzDqAGU7gjhd9Un^i>wL2RJiL zus%v@AEpm%%Oi%=BZzKu-Z;{R#2D-Z-_|181W4{E#0y&+r%+&FOI2oy+ey@AlLci% zfb#?7KEs(cX0e%k;qPKSZ;re8iNB6`Y3;pklIiXeA9Tkl2G#i7f3_%E8j<8d6q)Bqs^8WbcH(>4giQ&S(aIrrC2K2{Lh`661D)|M8ZZ{oFUh?=9^Z z+{VF?PE z++bQujRrW@RrTOCAiVC&%t;+LSU3vC4VxR070!%S-kA`pnbaLMA9{SUFX=Fz@m2x~# zI5z$23W_vco7X53VBP6;fhowb*}B;|*z2O%{p7I8H%p<|%0&McU{@3N2@hDiL9dWP zR?;hO__*GY5Oae_kIq-e4fCMI2#bZKDI z4Q+tAI?%FU2Z#`~Wf_xE)-k$}Ncxvwqe;f8a;|Qpj~h1D;tkJZfc3l5+R_5j0D`lx zGk98T8A5SFv5qzZqA}*&%J3K@USsjhpZa`AwA-gpNXmz@rhu%d{F+W4oI5yjWJ#Pi zBWR7GSdlX067$OoNW>p!P3O2tc}tIWpjP{ANhP`W7fdrTVlItl6NL^h?!gqGz{~R} zwnYHZ@?sMDwEIW}qKw-d`0+)mdj*oqh<1;>BeDd|LgLm!SZu07A@;McYeZ5r;PRpx zVgO04w-99mH#uN`Rz;u*vS6_`jCO&jYgI!koW$lwSUU#Hj8#z$=q`w^2e3pBV_@6| zJHnf~40jA9hMhOhDpkOY3c? 
z4y?CCvOaNY?|M8`at_F5udI5|0%{00gJ^Dd_tY<6_zO>E=K#{u>J%Bf1eq5s)5Wwt z_fJAN+lU~K)A4M_#l-^Hxej#}lQ+kKka7v^xSk#PiJmePm-*XSt%ZP~I3UBI3O9_f z{iZ&kYQ(mH=3F%pDN!-PI(Qw3!d8}1EKV1aJ*;2JlppGa=477v=uHcP8Gpm}Bf&(+ zqkj)I3hNkBAC_ZH)bQ**KVH}~T%2+0r@;S_iu*B;sJGgf&eXJH047Jfau0DUM~9Jz z5y)KeNSe%ej~$fP*3dETjKd?1fKS0$InPe1PoX%Yda`RHR+-Kn!^wxL!nny3&Cocm zn*&LkJiSG+o*SL~37SF4APSU+E4hEVP~tNk%gh7e|5MsXE+ns~T0AHM$-qxtHizKC zrk##0nja!BLnLzcENNgX0CO?@4KxgrY&4aq;&z56iDS^?}$)AwNGH&9$LYx(xq{VCKe6wV_&Sq3` zS}K_DL!ZD*DqKcXk*-6r%o!49&!04)xqN&9V-U@k-*B>0qHaNfhYbxj+R1mBlfXJj z)Uof5%p`VkJNd1LB-R7MrQX>>my2*}n(d)EA(~7d#XCepAUxwo$L>QxDFCBHROyes zcrkHksqDkESnfE5;bra%v$>_s-tp_ow?$53thhNnxs_H+Ts+UG4;g3J@X zwx=n@CD=myMTD07<0uSgx_%yKU-J+ab%$XT9~7XFBv<*m>LiOmWW(_vKnY-;vndM7 zhLrAoXjB)r3@T7v=rF3Ga=!!t-GuepDsH0WycIoOEroo#6K#1N0;OlQPIK*m43vHg zeH&<~j;aAf_dxTCHg3{l=`P!L=|CXM^m;mR+XY#KtZtxdptHb6iJv{|JN(Wj=P=PZ zh^QZ`Um0XR8Y@wEjTpVEw2eS2G`l)fGPc(fVkC_Th$Jbv(Hflsts-9ZCWWcJDGRKX zQ-2NhEU6|?Ga%fr9zNHs^#Wx5=whSl%H00!*K3M>X{2)X1JwJ^B4H+OG4{#>oiO*K z5JdqD%WM1Ui;4i`{2H~fgp(vQ<5FdZB6(7)0GvJ(@7)73w*bbs)#Tstcj4)Cz?>Zz43yWCPM}JFUd-n?Zv**dTZFr^ygeRTGYU3nR zZ?`me?*Od8nja19LO?Ru1xA0k#f=;AJ}4_}+4lXh!|wj%8z>RCMR$Ei*A9nZGqZW)e=nOvH%WQ~2QN*(6Uc~JY5}k_d;j5V` zL4;|}snh^{>*OH?qc&@805VN(NI$4?o^_!3Nt<&c&Q zC4=GUdc5U<+v%;nMWdTOreHmZg^)HRE0_1!#cl@z>BXx4*J*r#6bjio?cpR|zXgZe z@oIKB7s@*UR9TJ0kk9!wia*O%%Qtp!&@qAS!Hrw`3*x!cY}i z$7<$43MErFowmEQC+ApV6p{Org_IwtlH`Fdz^fFl0>}yRtyX7>5S)40BhvND5)|{M zqcEr%OlJ3WMzDh08i|{baprQ#{u}CptGFp6Zft4*@<3-87Z~aecchVPBMXORU0P>= zBV~OtyGGxFX0Er9MjMo=HIBY_{P|uSGwCKzwi+mpJ+QN=g$d&CL$E3|0%pmDdJGBH z)c@%a&LD-*(f&9zHiSe@v_gZ90h!d;0AP$bwl8{9MJSnIW51vTh7X3^VezR9bUtWXb}+|B34nxLIOd5OmmHj8N#}$1 z?MJC0>RR%9OG>T-oi^{Z#A0;Ap_vci-86WIgEG`{fWf5=(r4;^QOwy`gm(;KpTLYq*JJ--KPAAK1la`4ZDd?-Zkk$mk#hP~5 zTew}lA4t6*^S~3ioP9^j4jX2*ySU9Uuh=8up8cNcPGTRo)8t~TRzQ#K`HHS53~)=s zf2nGYhErHteU+XOghkgGD+fe4hUUc1zN&tC0!ABH<6($!3Xs~KZfF)TwHMtSS~~|= zEUU`;4CrXs6w=CM7B*%Va{$pW#8fnwATqzWaRv{$?<|BPQRG1xM{&!q0CtvFXwAq* 
z5ke+CY|rZkcnJzhxp5Em;za0saef20JHuGP$*j;WIRs}Dsld1$)g#6dI#W`ELjJK- zNlJBqlYX0X5o*RESd9l{x(Tv8)r;CXNedcsxxr{jB5wno8P=W=TgTp1o>dj^F3|DG znoSQx+IN@lTu0LYr_Ji|2T17{0y(41qiKhH7LCHtU@u+-839P26G!Y5+^kQJ_n|5B(7*^QFaEIC^ul~0;dZBmzk)^30{rW zNuU$dMTpoB{O5~2vA!T_=OH)k_zP*N?agsW_O|i%Ua_U}BI8B%bJ>hN&MsINPP^J1DnAV9r zYJ`EAKf2e>PE}s`BZ@BX1E6f6+j+0o$8OanL@De81d{s~At`RTp`Sbe{&Z}Ns0*M> zIx*ZRWmC>mgy%l3rU0Lv-3-kU<%U{{T2?0`~@mLQPq`eRb6;wB~7T&baJBbsh) z_t!yLx?#4j0Y=y(CVvJb>+C3ylM4ght)O7c{n-=m_+1(uYRZm!NAVRm z_)tI|pyKCK2?Y@1;}9fO7`Jt$-D;RC`McY3{J3mDUqFl%fHTikELd4epWfa(QjMu5 zg7$l8aB7Kg&r|j~1gqtNR*)LND2_CCP28~fpcU7JS}9=cyS9OrAtwvt5J1NP`~Wcu z_SZT@1S35V65D;QO}M^+7x<4Ij;2gU(*__%%T2VH!4K_E>HIy^S+`< z&ICw{5iwX0h<%zqxuZMp=Rk_{XUpgmr)9Yk#$GSLIZMrh8uX-H7AhsO_byJlJ6n0y8{iz5orTqccZ^MIeIF>1Q-;Kk~Yv8?zco5@$V zi*!=H1;x^>sT8(>$YVv8-f6py;Pe4pbV1_c(@{Mpes>D0kbhOrwz1zCEJ z!t&j@effNcrHWZXHJZ5#kn?&iCeJ`;BF+_{YzQ#Pr9aiDW9wfNgkwMlzZVPuU7Uz$_L@jgdsvHADWPN*&R%e((uY`5;5!hU6qa7~C5RmWA z`p55~SObnV{2n?5kiC1hLNf)KtJsQ-#cD`!^`;L0&u}|~*JX6;0;r<(BQ37a4bFZ; zhgvQH5}KEQavv@-^N6F#s1(e1$d9ZP0L1@}%0aU&`^n4N{8iVh^lA167whKps%9?(%UYoN*NgNf|!^)>3O-7EL*qFdY) zl1=0>^d~F@9)?ZzkultCwblKM!IUtW?96NjPE1jVU3v`!cM8Q!c{~k-Z7#%LUtAkK zPa#%xQhNqOJ0Ji4$#+n3^21gKO7_<}J-axSa0$tZL1J+h`cFe-45MN^4I=qA+ifaM z1$*xCW(vF&K?|iGsqrq^8}`5ODRk~{K};RcI1s^Hm#%Rl%Xq&>g{TTfmMaQYvmak^ zi;=K(u+<#T)9yqyhURh(mvk*e(S&4P;tfcY4Z(t=Z$sZe#6mAlh3h20whYR3fs20V zAsN>&qX&(Wv*AbP`uhjOYoaft_p1C>y6oRxvI?mT)zK=tARB-WY7gb$u%~ zvL}$tCnr$Qw;^WwV`*?|$Oc06pyxp6k`ay0BhDberY@S0+y%(VO@CLX*5+V1-G|dl zJ2kn@eLt9xO+}pv(OyFylEkje3rz#uB!i85 zY#hVV&>ynRpkg_)ug;IhO5fODUisTUL#vG&S+Wemfq-baA-H?tWLNAfPNBf_fghwf z2RTO@Y(t`HFoOHstEd^Li&~g0u-%clzt%Y?@^G2Ho(|$N;|~^@ju)}EHC2X0zP5A{ zrGT5HIX|SyM-gP!G^>_?N>E%9vLhj72$>Lfv!1>J34O5qOB74#1KoKdhAITu*K;H_ zoR;rejbcy-I-?IyJ|Lrw7gbzSM8W z(TCWP=s|SRTBQ#3L1g~q+fp!YqPfmQ6%7MmPRmuXs0lQ4({E^MGfH?hD&!b!!O)9n zvNnN0#MgCc>NExBo*w063WkNZcQ!SHKL=ThD=n@99&Uz}ytC}GJY;?*lO)-t55PtJCWP&YL5ea8k!>kynK_Wlq{1B}QHPd=m> 
zPky}efnUph(Yd#AyWF&8_oD;MD$qLS(I|9XD3q~l=(LwSTPzVMralx>q2EK}cJFH+ zZ;(PRO!f?+nBg@=jM5<>v9mwWTEQ5wSUS-;%2>@n?cS)pKb=`I?*==i?xSdnIh(#%?KtO^kx6y9$+hhj@=~nHxdJmQ2R3-T_eFzNJQ_#3e zjYboJy>1N*n*LU1SzF0(%dZo)Ay}<7<-L>q@3WzT;<^yBD%jU4iXIqk@QT#yXf=7# zBf;cx`ud)&?F<}Xd6Q?v6x$F?4qsz@GQx?JwPi+a8ap7R`F7kmn}Ct!&wl1JRAjp! z>D}-tZrG)NkK_C2>9hL2uIbFcS;NRtG|mK3jUkcbyPrjsI4y7%U({t>CPHTECHKR~ z9m79lkvCYrdiy%n7+^VI@RIhs2zFdN)}a=#SeFMX5TZg$}fm+W-^$k(Sq*xU<5_!HrfzjRq|=NNvro=ARva znviVVp~}>SeSvFaoJ=FupB&#A>wmL2yU?oJah0CTh6E%U>RY< z0Cmj4=94a0TwVHOO4wE1kjnje#06ZI4b(i)pWlrG9tAL>VQFKrv4P5=A)1qqh^>@9 z-&Vg>20ArLhlVN@2#$|&8R3^wfW)1HL!KTOg%d?REK5}==sjrA8J67N{?dt z44`O#=}Racr;EWdU2q$e6_|L2q2qwJXjr;9iGY0n5|tT1jy5&n>*LM@wsoj$t;Y^| z#sNmO{L+`chH@~3VBXlLfHhPwf<$bw20gZ$JvbDN8{Qb%Ycci|WZq<{ff^0YIuds9 z)kvH}fyKvB3}yh*$wm`nnK!wBM7CvbXpYm7@!|{3!==5-{~-$gQ7D)pNW9^c2RbBo zw_&ybhS{refEqUus>+mblJ>RbHR^oXVZt#t{IKFigh2k_b zC_;+trO$PCh@wRK?598*&qG(oiJ*NDvM_*vlCtBm!{m((F3ogC5X}EuYBv6we&jGDFnp3rTN~}-y-(dqdy12`%0bWIt#VSV!;J&56D=ujW@2)(j21G5g+Im z442d~HEFqz7EEzyfI18$;zOJTJInMQMVu;?Tfc&Kaa&e5cuk7JScc&EnX=KBRublf z7WrHPJKMWDK2il*GweW8R>edu1-YZKONKWzZBTE!{~^`;3yIbg(zXj&jx*5YzI zeYmQ%?hepJXr+ZAGkWnZ6w1iEzw{-Bkv#ypVig+Y;wF;UU!%_2xk}X-;B*DWs5&D! 
zO2H74bBn!Sco~7SLaEG-lQ*Zo(D@{J^T;C=7f6yHf5ewl-0;HfQG|UCFsmmY*huBf zL097;Hr%!~r`f<+vi38FZdq<&079Ems23X1Vr8G@@7!N3xLe+ayL^H%Qz!tDBTa@% zZxKY!kg#X^B`^^^i}OY0^yO8plU0BgjP8_%LCK{QBF-XL6MRQg)*6tcXfj_z#C3a~ z>8Q*PQE>A7JHLQ-L6$HRJLwmUY93P(+775%fG|7hvwQbZZ9+B1$32iEp@k&peK48P zi!W%{fF*RYG5}bC*nC63K*{95^I?a)6RRd;fa7Gr@psUzJs-q5>{9@m-PCQLsXf+X z%zti=mLBh&;r84lGs(otU7RMqz{61=v{iEF#_U0329_8v!B%uU0B%zcKVE!u9TlKl zFd5-`SSVagKsn7dHiERaEUA9r2`2G5z;qj7+<&E-J) zzY!Qs#y-I~`90p1m4*q}A|y>cDyLD=DMaTM8{}voXuh^I3ZZVNNI11X0yD6)!n{BB z{4bzn(aLM`|sA#qfyRX3|*O-aJqWyB!0L+EJam064(Rm3puq$f?+d zS|3c#_f&I^{aI^E-T1(O(Ij6D0V)qnw2{!rUYyAVi~-0&$5k{pm>AOSpM1!Nr$95x z=sc`KO`({9XJHfg9E`M1-j}J$89?0VRMQ1+S0Rd)H(BP8U_%otG;j%WVcC0tkqBDs zNwBi8vz7-EGxjTptpMQDb=6+<2{IE`#FL*o!o;ftUAI@Q_klMfiqpF1EC zXOA$+)NJg&EgACa@o#E9unI(Stj7>t z4dC?kCVL3J4ry@`Mn_SZfl$aB{)!zmvfTtw1~y2mh12}dKt7OW+EC2z$=6Xkh}^)c z-%IF%ts72r>da6N0?ab;jb+=uArg;H?HHI1c>D?n>K!B+licjk&@jsJAMgt*@(*(v?ORd_b;R5(G$_2<0d=?hrXg!qXN+UpMBX!Sc-|F z{bW7+S@P$s=~4S}n;H5O)yE113r{DS_jy)?766*Fq*DR=8bntiSq2AKh@nh?VS`TN zP;<>XFePdLp$LV-S4P?`b*> z$n+d0RZhW{m5iX1?i2!Skg-^1JO^6#9Ak)Q4`+$OBqkcMixdZYliJ6eCqiv&SB2#g zf=iR0e$1Uj*0}AAs&Qvh@GH7tr^i|dvsYSsC>AYaLWTf39aMrwd|X77pvxfV;hBxx zRv-{*aHHT$;DxT^VlcY)H|*hOvxgc4CqszrGC>_0lJg>O17OZ#?+M}%wHvQYc8oozIf2a`Wp zxYZLeh2)LNFoaSBJJM@c(Y^zd+Z8P}m4QS|Swu1wgY~E#O8~f~mB|jyR3TU{2i)F| zK^zp7B-_xhgQ#>T?=ysI_+zeqsz$?O69VZl`6Xj502#Y=16AX65uW~IHMbqGhC#Y2 z(M^6dR1Bkg39rS=vwbjeYGV(r0xVpLjB4%x0*U<1PgA>b!Y4&f{muyFByIV741U=@ z$1{i%NG>sNsl=T6`yCoH&=drV_tmVO<94ca-FoYImcDGjV3FIf-ByBNj>A-Y8EBn<{62b3 z=-w(7I-xJUOI-$$1RIOA*|;s4S`5YN_QN&<8pZ7d4wUgGaFKD&RG`!3|47#l+WwT& zPUsZ3qxExEU9g$^4}Ko4CST0Yj%csJe30W5l54YC1F8=hnL=AwNn=_nBULu3dG6@A{u~FwXNGF)HDI{_ zI(9A{`a0Dd68T1Zp1NLw$=PkL*`V$JD&!syf8}xe+jvJxJ!}CIVL$%HPriz_0T!4p zFdS!>JLv*@rBW%|CFM)VGa>aolSyoQssSh^tLI>5ZdhPkS1 z1CsTJq=~+nFgB=Yu3P?|6I%psr|)%vbxTGGJ1P9*EnU6nf?akvBl?1VwFkjtn%I10 z{Q^e%5aEhfzme`Sz+63GgoQpR3y>(dCqE#@p~G6KBYSKevU5JRdnYbi;ifDxUHiWJ 
zoKt(9d_-DP+z8{BxF&v%)3U>~%8t$p%n}9R#vJMbiUq=La=4p=VdX_Eq~TU7f5GtU zcTn?xy(qv>wV;&;T9cpp6iQB>cjEW~&|+YqD|=pwy95!|?%qSyI2S^Q9uG}Ja(u6_ z0$-UYyqt=c2uH&hCr$} zc}R)@ODB1&Z>K5lLO^r3!Y*tNU|G4`_rdVUDUuyo#y`%&hWVF)0fSO}h5!;_wO2=A zM<7RYGt3K(6UGiFEKd@?#wZo-f)U%NKZ9~{lJ8hXKF95>uwbd;K7#f0af(8X6Z}q66ja18j)Cf- z!NvZO-cg>W7@md6;d8L6=l6J#n`!I}$P)R1E(JWq812(77UO80FQCIlX2^IBBF2*s zWIX@UFs zBr83YwS?Ohi}$ZtUyQ<6b%3p>RLmM~V$<{ z7YxyD3Vt&?<t>5-2yPExt6(cL6{ay3@WfM#z+8V02{bN3()fS;!i zR#sQ3#lWl`M0{{{0l~UIKHgL0T(P}c4w=QobfNb zg4RLEnInqSr@B98yc)+S8bA`F-5@UYJWU7o0mK%7?57pFHW;yAc^zfj3uV`18KMie z=-!r%I=XEL&UQSE*H8WjEau{W48aPYev>8{;SQi!*t{fx@P`T0$$>}d3r4|TRiijg zA?`8xLjPcLO1ne?rvS@kndf15v;{JS1pg1()N!0no|XQvD>1X@9X`FV^9tjOHdFb*9_Q@pt()~-q>RqgaTHBVy;+9@Cm^>B#O)a$%i!8xJg;&SWvG$ zc4Wn(g&RhfnF6;>CJ8i)+Z=4LZy>RDp}06{|EmYGVxQ{9WIqMxKx#bvbqvKtL6cUh zQuBiWD0xMVv0bcTTBC-hGm8_SVe#_FqIhK)<{o6kUP>_f<|1AspD0gmf@i_ zT7y93Te8!s`<->LGjzal3kjzI#WR9foA}5TyqO}pZj)`TM3CJHQf))9J~uQL>;Rn$ zQldC_lQ$<1(0+n6fM`F3K6rBSEowEGJho1LKvaViHU|Aepk*9ZNo8-70szq&%}(l% zO;8WRS~B^XK*$*CXdZJ4MoL#$H=W|72yCeBKTp9JFN0_Hi`#n_xGikj%>BK1R$vaz z*}kdC*Ci0fAIAf2x&Pj|h1@u&&YdQi!bcJW9SK*~U( z)hHgF11x}_niMQSBto^nwyIi9p=_Ze4rSCJx{kLN+fj8$E&-c5+KVO|P!IalBJtI+>T4wJ+M>eu^ILa`JfMpbRN8*zc|LN z?9)@}%=7?=a$cjw8`{~T)fwSL3M@bJ#TZ~IXq)FL3uJUl6Np*;{7dc%{3&XN;)3y% zkssepAy|pZt?T)D3dFXO77S+)oVA)p02e^Ab_;dkIS0Xpo?PwE%^UeKlnsy+W z({tk3Jfli<~7xCisso{g{?86jjj_G{$ zN=GbZohPb>#EZ%B460*qV^@mdE)B_rqYLE$=Rjpm7Je>qlM`NsW|WXS3y-*OmB%fq z$BB}HJ>F4iEaE0}FDTt5oaUOt52zgIns*~!u&ekhwu8~S-Ou%oLKU~`f$nN^DMhVT zEqzSaPBpHBEx5>?f;1e*->qLq?YNyZug;@(pmQCT1%4FIvpe|SI~2eUZflTXW;E^) zQuGhv!T25|H+PSSu@Cb742%wj9kd-KbnHN~JcING=@YepLy+^8T}{`pa|97#U)`bT zk8xW-wYYjVv0pTir?{P=JL{ZSp90M=uNmnI>iJK7tAj){u=%51-&p5`npi)%NYT_~ zRA`>Qs)xzlOE7uNF3;xvhe)K($t@PFsI++q7QuarLjeR!I;|zoB9IcnsvJeU1Rz^a zWD%f@+Z8}#L(Q}+4!X{JZm1sMtZ0sp466``my;e})?Bp)v{bxkP`9_vfl0;Qhlua3~_r$`jqo5wqSrFX*q^LxO>tr$wijbD732hV z>FjNyeG1L$Z|kMYbALdlr(Z|cAb97*CHsHd$oB#Qs=zcAxttrE9jnk^0*I%!C5tB1 
z=>IYJKKXMP%HwttG5#n3Ef93CqFqe>aS5~pBn>)j+NUzW)rHl~r!h;dKtgRhEU89* zm(u44VSd}6*VeReR|7b$`+LvTk=7v)Uv?P&$GXVafaGGJlies9j7I9K=s9^EZ0Ytz z8w_gCh7a*E`P7SduoVI`GP!XT{4jnqtIN}l#aq2vnmOXR-Q$T81*jdQ~ zRW!F_L#qI?s2O`vJd5D09_?)(>KYIg+!2t|SeGmVvpT-NAJ&N~P!QUorYlP+Fejd= z;6PG4@Imof^1Lne)PX2(J@n8>e!S0%&VkbTo9G;;)w=wIwNRAq@7BV!R0rT3O#eu2 zNEb|sJG^L15+nGLttUV@U#&5I!--(lA0gSP-K;I6W(Ro(=gj2h7t3fEwsCCdfZX#kWL9IvwWYKcs$9tH~ z^fkKy%0df5mSxRA+R6U`OQOWyfnXVI^SBt=1(O1UUo8vtK&t&OR9pH6-bp9Dd{N0r9Xk0&+BeY?tckx9)|Hb z+!o2%d(ut;jNZS%5`Gbz$|Gcfoq_#h<*&!tjGEqb!Y&^?OV)( zV+VBTh&(JBrBA|o1^OIAL4vwJ$-yK_-NXcRLma9Tr|BE&ZHy_WP~6t*`OEB~iF1g~ zJgwNnFx4=FZG9iyu(Vf)`eiz zZZas=ptR>8YO*w#piDRj^+4BB)kGXav3PG?r!4pb^mxFz6$;qU-r32H5y&#&`8m(c z#*kR?1-!*65^_}qS?caJ!=fED zr_HhgY6g-CZk3Uv2EY+d-&cumrm(RM)B;A0&==41779fxJVcuNMax(*xb zxY0!QAgH>?cmUc4Who7m(lOYzm7{Prgf@T#XTHWi1Xx7e0*dokBPeA0wJ)IW^zn&sMYoKEVkSXG_cYxNp68Yeu_k;;zN3}neLl|#Ki7YTJG@H~g$h;+}K<{S*q zake@uR=NLu5#&ZNnFkYxEOdJMRe(UI^g;ky21swWQL>#^eh%&0sqKvlPNyh~o@f`G z=}zUK>X5&9g{qr;7$u@rpvA>VB217s9ANQ=VKW6dc;X3W3ksaQv8pZfHo)q@>5~FafrwSZy>t6exR2Dr&-@w3Y_wE&0kVSG zCQ!AU+nZRgq3%l%`C*-u>id6$a5U|s_5v&k-L@>)>&;lSECRBAwZn{zA-PuVN>H89 z`m=bfxC|kxH;GnoyN+rX#p^0d=>v|K>I7UB!eYp#JzA&``b(M1w5r z(-$}F$6bx@YZloATA|BqEr$J*R`zvv!LbeWGL+hTcGno)l12wI!hAq^@1{8DuUIZY zQ$0u|$v7E#_5sf3CgTYtcnrZ>yU77T^zP7mVNGucPTo$ zDQS{uaS}4DDV;(iaUBUn=>S)aC$Y!|B);+$mVsut%?1Z)X=^UPWNDAD<~Xg&>#WRO zf`||!qA;KPKNq>-0E))1c?d*#U(?$HZZVH72F6xJkjwJ}j_7HrtCT3LiN=+?G8FjR z(KAERI?y~l+*9AW1U4f&)m{aWR7*W=4R_Y^>D>+FQ+HTaSEJ1aS+iFck76_-kq-{R zQRZ6!PI6&h1oP)xFYko4p<7$L{HRjefnW*V(OO^^2#4Bnv#O0(>5BvR)AzS2=49zO zeIIvy2lfvuNO3E32!V`nGM3R+STi4`D74$idh!?o`No1GRU9`Gp`>Xvc?z<;UU0|` zb%V_;sZ-B^^Zl`%sYc^!@1og##?b8(XoiJg_eprb&7Tu*SAAKtrPrhLy*f z`;BpvGA9z)zeUwRr+oHht@@+x^f8lgc$sFnTbD2Er`Zo7}N6hzQ>mm?{-h`y5S*hp@74smJGjZy zo!s$kX`Qv@2!8cf>A!J0c6J^QTL`U_VmMSW>jJGECMb+gdtkJ4^5+EY<3`xpA4bXP zyKAzzHn89Gc*<><{2u5!!w87S%o2IKXD$x$fW@DRt{@jeNQ$kpLbB|?Am;v9^b3+<`c1wdNe!wi=al_*EsIhZSO 
zL^)Dxeb+$hpQ|ej7KPKav}aYSx4N0wxNM`ZC9u<+5m&8LwdI;+jZYAQN(NDUC|nou+3Hq6|NB%Po9R(0yakx+Z>b}JY@>;siH z2J?FjmTJ(sqmRz$=|6C^br-Y`Ca)SYG=I5aq)2988;y?k*x`@TK1@=htpzy_gqfk5 z*G++uK{i54M{{UtyUsG(g?!E9r{-cnvMLQS_C($1P(lscprk!lSGOz%+VN}i+M8f> zbi_ffTDn<6QOjT1#^ME2tmd@UM(RjJ=!RtNx^VLSw#_*f4=a-H0l#YjHjSC`Lm)i7 zRvqXgu&YUqYdXp;{^Ooj2N`nQ_9=W?RjLM} zp>k)gGiaAk)KyDqjH;qsK?p}DrFIR%k(=L||KuVlvGH~+;3km7xCXT?rX`dxT;4*p zHbd`8p=_*r=M^v6JHaSrAD6=9|F<+p|-5n!?ug} zIFM8PUo4l8r=XHJuUe9R4oN4k4{i99B-(B<+6W}k!`0Vp2GiPmfzC~R?Fpm-#7JfA zO@p?45my$<2C+MRUN$@Wi1o_JE-1=yd#qZ%2kt~>o>2N6&bJe-N$+EOfS2_gh1Ho~ z`zA#vnzKeUEF6AM)KqFNu=?gYEVTgQv^xcV36jM80>$>9y;`!n2BwiZ5_gY91f>|S zcNuRAcUsc9U8wD~r9S>aI~g>&j3A>oy}>QH6^MBGOx3bEP`TD7N`fDMohAiScpkCs zqlYLd$$?t)ew4U--)6!6FUOB_sCf#eVmWTyJ+}NDEwU|N1qS?|wK%onbHOrb8z5{% z>r`8neKVLc@69bN0|-gu)pqV^D~$19e3?@7sr8EST4CG)DxZxVTl2esl%&>MYmI$R zem*x{8}-9LD$D9ERt%Br-mzgC6;3%^edT%Ny36x8T9~qSR5Ok#gfQJMKby<wkAKy0jNz#mIXeHvya-t~;ETgz;QcD9J2C;mHV~ zsW;z^x2tIH0%4F39~FamsJAzWou2!T^&sU4_OK`HH=k2=9_0&d!nVO*_&M6{@!9d> z@+pk5kE*48E}YD^)dfWxzE~em)UM9tx?9vnfBrgy378bumExl{eK@VbaUs-fxD zK6UEnvKgx}?~Xpg5sIqu-Y%4kqrI@{!cEK#AdX(QL>-hKMgg^e+6)jJN z@scZgp97te%d7bxsSW>m`?;>tHAn-bKlvMF>B5zMTkay0fndcrz53&-0C1A3>p82| z4x*HherN|q=dq*zz(SjLeHWZ6WVyTqv!_qNq-?p?&4y7l9!E4^qk>pzrgKy{HoxDe zny1ox|8d(-p`$63w76{rxv3s0dZOI-slUhHzEE!-UFs8Yb4K%2z0y;Tm)=zjpqH75{Gcahsjq2O^mhv+ZiD zwfxGVQq+e423#dnUHu~{LjKW%-=gUb{ywQMQ+6OFovhvO=U_zgCO_qN+w+G1BKkI$ zr*9y?8@p?Hz5vQPoD^vN6H)SaZRDAA12({Hn}z+E&S?`c%B+ zDwVnZ!?&1~(k&q1qKvGT3NH2WCnsu(ccqU%V$_Rp4WSI~yvnmE{L`z=bx>a|KvmLu z@MC7GT^6u}68Rix7ZonUibBE+eI;P_{kr@5Zh$E@)!heVb>_TSYr-k;Yvxqc@DKst zL#jZnGCl%TiG!I5J`SSvZy!+wAnE-2Z>pltg`=8sw$D!ShX1m$^B4T7003S!uI2el z`*4ifYpAD7+XE=653Txa?IV0pmxo|&5GB0UtV9|+5HQeglinrFTs|_4*&`V7z4KFx zlEbK_=jK{78sNHXgR-MQc>iiq^Eg`AtBYmIbz&-?)L5Llf$-X9zYAblZh4H;HI(!Y zwd(_`I$S|ZdR$jp?|!XM;`MUcT?FyJ{S}H9j9R^oXCSczOP$m|P`YI(je)FI0<7}^ zcY&_gYxj~pXwIAWQL$Q=J5|HC7T6A>>R%p$$%{K3Sv-QJZsNpL9s^ZTYF*{2FyY60 zt!mM8gjlr6XSSBz3DG6?ZacHSF*q+_NniA3@tGt3BvwhturEC4~6S?hj 
z5kXO2sXNDS3Wt*;E)%2f9vd>Lc*}64@Oqia3W${n0=X-Q=h^eKwKH@d3KJi?I1pEB z`MGnueVCu;XRW+8w}?jwR6XAITpj}$;yq8*!eJ;23D1M;f~eN^ZTPQJ+YMs0uN#9# zs#&69DAI{pm%G3~VsOy>KC~5*ZFlY{z`nB{W=G*(tt^Tx&M5G`2bRnO%6SP&H zOK?|>JpGORD+tlX;Tn5h1F66`pwzyY*bnN!P1_v5*~gxp5PJ#iDjlPtTDY@F zALWBKyuJ&kUhdU1+=mFSrVR9(ud6>cp=>879wJ5jr>f~`7i2y{OA)MW))fJd5mJOB zZ2%uQZ^^0F@SlUpaK935_^;Q|tL6e5A=x`b!NN8JNtZ#@-rN8}N@pvGq__5XcCMC6 z2YYP14DW#D#51j(C4?f2ZvBtek-{jyDtZTHs9CEEZcH0S5xN$`@V1gXilQ2yuNL*V zu;iz*P&}27Rj#+rwpu)gki>uK7hQw0(8Kp@(s2n)zaM8)p1*C6_zHTsJmQ$gWOGbM z3Kw-+rd%c#QHbd$-&_3w^9EF|S%bi(K4$eoZRmy)k&R8(aAPH3W}JN&P+n7kA34GF z7im8PWw7{C>xFCm=O&4C_Ymgz{)_*gKh3%yQrMHV_xKo0DGtR2htVV9*~Ppn`SU*d z>h}!-HvBj1U(bJ_PHY4TdHh{9Vsrn~*>PeU0MjgcV)>kgsixRl5z{ZV(GX5cn}et_ zG?l8V07Qo6l2(%0g%Xa>w#67Q30u8vvpj4LBVy!8jh{w^iTM>%ucLyg!S#g2aY#a> z_{Y^)Oo5TOt0;Xpm!HYBjTRTcRO%CxXD4gch9I$-2<8eDK8?j&uXPPYdIycgoAb~j ztXNswya^;VJL@&OTS7>w_EzT0VDzBO=}K1m7-Jt8Xxs%Au^mgkFC63UHhm?yhL++S zXk&$k`P@Z@Y2!iMm==-vAc z22O1Ve04=Ka;o8x+$sD#pVhOf)jw|dZ(B~huW555kR;m;kDI}KJAJmfmkc08j&p60 zxV4Y4`VZO*U4wlDmprlL9k2w4E&P=pb_IICpiPYS^s#d=X&6p^r^er}6}C|XVz4+O zt)oEHsyek}LB~-@%yga7g`>2wI_PiuUIvaxazjH@p5QP4GP4$p`fPu8x1nA^OS^H{ zaOxU_?e_c&u`Gh?4N&=lH}krQDD(DC!jfNnK1puwfBZ9UP8kTYU{JMuD}?F(>5xGsp$kjJA=w(o1zcz^ zWX%Cd0j-3?BB_J!_@YKKgJq z2amxdZM0g1i;S8yNt+Q;J9HxE(KT!x zKug6h)oq2UQlAzc%;&lc&(zlX(hdX#s5x7^iFp^SeCso|J%Kessp~z5!Bjgw`t$6c zdXc2w##~2*Q#{69%M{1+!1xtnuW^|^VbptmIwi_BQq2tbF zewuFs#|=+g#^+XVv1MW3Ot)JsHUr_p+2`^w8py9!yk)l)ke#bNJvx!Q(xCGrj!#p1(x{)U?yGX62!!(Tw`y#bZ=1rH z-0}Wf)kA%o%Fohrw;Z*DVP5sCG0Q?9Ujt96m*9w0cFXN7VISkr>K)u%gR8Nbn5?17 zB7}9BwYlp}FzK^|TFzcV&^Ua@HrBFW>TZ?0?Ln^~MLEL`rqebowf1^0Vcze5-rrjH zHCXCZJs-4NWF8_&eZP`D5=`pn&v2@&w%{>RH8y$|gQ2HTDxbZEy3?{o0nZUhl0zv= z{~y*hQ}bF=G8}9~OL_Q6)Uxwt1RNf5Npa1w1_FtfYZ2{(l4~!34+=+;@4IEB>cS2b zrJY=V4{BFptITvRoQzx-S~HJf1a;0kVb~Z2lyh{9z6DcGo>(SnbZ{IgQgpSYDfbkd zoIN7cj9?B$Vw|u{{TF~_<@MjPt4sOS#6iAS0(e&7>KdT5iuXbNw%Egcb+P_UFsFQv*dZMC^b51Qkdo}db`%_p_}zV4PA0<$l*8{q 
zH68_`%MaQuI7eamI4(!e!o+bw!!;#{85Ol&&YOeMrg-bEyIghwEyA^lJ72?}ODN%E zyn28upq!fja$7Hl!xz{0T3uM|kb!9<(VKhFJA-o07$xTW-sm8x}DZMx?S155vlZYUTHTAQ(;>NyseoBiS$D{+A0`rxR|QC zkU@mVuMW!W=zr#lc)^HmS5G){a<*P8+!I1sWwxP)5#)QiiZLpj;?{;^-DX1nXvyHs zS1BLrthk!LsUGgFvHlzw(O36fSL-aGNPT9UL@o(O2ya-Mqs+ zIN8>G=q4l?eJRxo6G2-5YJLFAA?POF&4K~0OE zt66P_{vjL*pL7)*wjUu#ZLZCh9|IWCiPEsJl=qGueDe(|hSKyphcMqFZ}^{EEqd*L^q4X;S%6c3 zdMl$%94_@yI1QI_VRiQ7pP^d8uWD5qsYSGK+Ri26?PflBjXf_3rjor^o{q~PRxY%3 zSAY_FJp1k@=DJJ9eK5MKOZ2oMYX~XT>2~AHLzw#Z9yvg-!eL84R@bp;J0GKEl(zST z<-ZH3N@`F0L|!wrqiHzo?JXao4gX6k>5;vZESO&oRnNQ`>~e2y{y_ipzS?8onxFT2 z^JTVHdj?@58~i2(3wB$7e>b=2)`cCr65$PXZ;zmY*pQ^-gssrguy8nV13o1OHt%#_ z9Z$^JdJAMqILw`0{k!lSOvOlY@E_TQSE>M+YIwt(X3G2!{90e{UB;lr~i> zA$tPruE?4r4TEy+ZfY2Xl2NUoovWeIQKSe>i%k^AVFDei`R0^xl5Y0`&Vi}(XD8q7 z+~*b&eSa;JUJ}NdeoVJkg}j1*?FlXNHNhlhCxDW{Tgjj)f zw{}i#5v61OuT@)r6P74JfAwbi5}LB^YfJOXK!i3@RcHlNx?lK?RoT1wmW1*ObTu6J z(Za%!928i~2TqVubiya=p`b^?5yk7O+hZs;oC$0jc0EPJ$F#RLm@g1MPSn1|hX1X7 zZXLebi;Y1zIi%&?3?=GZjc5lz$$XIeZ~+FEiHTgKJeZHI=+&%r2b2PgngeQ>b|I9F z7;f#!$FgxYI&+fYp7d=)!DdvWNb=Ldsboii^`5X^2cvbAs&+LNXzkDx0+Ie|)%7{y z*uT*yjbVQwKfnC)cIp-;wDLhB>nr(X{IoIUHDRfa_5K2j`O%Zulq?*9IHbaQ!SH0w zz1iDxzOJ4iGFd@LH8EMh#9g@Cs%Xa_TRhrE#*h8cHphY#Wbc^1@?n0dBT*Fp2r3n{ zeV4}ofxc~8|5Px-t_x<2#-1mlJLsrl!~ecs;cKZ{d#NZJ(ZZOs2R_&g%kxC}j0^}% z_*I{76_nwb6SLGd2qYKVl~qZ1+><`K%bJt9Q|qcDy{5 z=@ufwnRaA*4akV)+t#*^w6=nhQIn{b7S)nRaEjyDV|D6}5yJ6ktpz;=Dd;z~anA+i z#gyK9gQowF^>~MCA$=p5biD?l`fP?%Vc$AoV$8$`P%!r^gD-1`w)WAzFNpI&IBe8b zk%H|&z{dFM-*01~T_7d3Nl?83D~CNHVTTWjJr^Gh=Od(7%gtsK#BjSMgjqwqQq3BV z!{JaB<9@DdMxkccnoxOuWwyM!Thq3Hk{PPChk7f5t9#A&E}@gquaVFd!P15cKQVX> zN`{&|-W!WB0oavc{5OFr-$&9W=A}TK@2jSA8BE){*CBI*o0|8po{AZoGC87lEYa3SH4}!m+0}>Tvi{ z;##K@cR9cD`?3K}!C+E;U~{|1?RQa<*!c@}*yTQ~ywkawd#(Sqj%Kp$9-nLu(2wAf zof7~LF4cz2V+4gBI!wue$;&$^ZQJ-c3P$T-ZVgj5{GY8*JQV12BhV~^IyVbfydQCh zx=OoXS|!T_obGFdVk;Up$^CU73>HlrMpJuD+3qffU&rH06YNEs=?KU)8fkP+s6( zJ=x9v$AhJ}C9qo>_#xY|=2}N4mQkb42fx#7#hi2{AI3k0b%Szk+gE>S?H|*2ewWRw 
z)rYQuDZy;pad-$T{cK$+@~HpKOG!-bF^og!(@IWHA!W6U+FY`pgY5pBzr?Kn?^bN9 zw4!YUlEGMe^kXv|^J)9QmF@tNbUT0Wtx)8|&823xgMA9GkC)HS4p>h657Mz<#-Dq? z>L_R%S%TeT6~j=}W?buZA&sMiCl%SU+fi7G`>Fx&BM*5GuiApCp28)l&93H9sCD0L zZ8k0d1orXE+vr<3`sk)&EcXgpiet=DOT^a@N^ZZG#1=ukj+L(qwf8A>8;%-_r9K6R z8LH=9Mo59m%dS3KK~aF0e}=*ZBj38Qygg)aA4wKe{`IfZc#rnKragzM#?}}?LXW^m z#xrT~V>sN~u&9Cc)5Mq7TstcUh85Q{&e!Eeo(L>fhK_t~!wZ64v)$&ACnUJBr}4Hr znd@UVp@n6mdEJy+vpK2kd-H1>wg@JReYJPSQU*}S|HOr2=*9kH+Y4se)l?v*RfQ`1)P20`hzck8ojw;e^gxy@+@KoTd;jT`sLbSE51c%QU}G`sS>-;g9!849Pc{F(5je$lRZeSUMo}^-tPYx2 zM7Ier*2g?$Q?E5$J*5p|iThYjDd*;D`i`ThuW#1KYeHCotD>mqKnmE6EobuUPkyT$ zE9L^m-@EtQU)wMbM3Q^!!0`eYmfM=DOfI6NP>vUhaS2u_e(C4Fw&Aj%{M%0V72#yr zxes4ONJ`~&P(!h6iSBgC*EU=i#%?XBavdGA zo3Ye?**5q;0qMzuQr4|ZuWS2Y87+LcGpHR~xs4+43?EhisSVY-4mQ?jhl_&imHD_C6T7j7-`@_U!F4o-crK@2FjSV)-!&hF5=&fj$wIhl>K+yUkE)sr!GOv8bCOpP@<2@kA^> z4;XGx=q{c7!`wj&Cg5<@8t9%1JBc>HX= zbh)MfX^=1nf?O(78?!G$6xtT$S{u9-p(}q`=QgmjqM51=gZWxEt18iUC~SMhDNp6? z=p!6B%DAD_7yiOE*V%c47AW!?ypRX56T`Oags)QfU4d(wZx{9CWx z6Xvj@xX%!j&bl==o&(9z15k?oC7jARTGPo5|MWWB-DR*ZAat$cxC=hXVS#~rO{$8In!gJsp#?^jc@2Z<k>JmvKWWZSI2@g)>&KKA?L3S|uF&D%RgBlRFBB>9JY^I|h=1jW|Xw z+*R0m2ZhffrA#1V%Z5&MBvV1H+I(%p4Ad!P`dr(Xo$HgRcDi%#nn$cE-4=fr0A^iY zR2KzPp{zt|x=Uu4(8Au{lIi7qYLmWZGJpc^pS3voE8|^m4DT^M7W&B(sig zx0BQ_AmN~Gt!xy|r|lX0P5J4qkumlyoEq<>5ueofm~26f{tpoiLx*K%JSpcFft06) zs>a_c^;R@eeaUX=Ho;`}EAg(dS(ZvK;-Y#m6YQ*mphd?!L#Zw)tC zdk`nEGTYrjZ^U0@50osvT~%%fOiI=4%4ZlM9KO~+iZ+5KZ%2Pc7|j>uk-hf&Wzu5^ zUH4d5%aQ-lM4=Ehq2{v35M(-5>zqvFIH+8$Gnf!gMLOt=db?m{3W-2ov6ww0m^lpx z%69KZy9H$qIayjOY6GMBd{cLDF9;^Z*>l|8>TaKlNHXLm#KeWu%#=5Vo9xQ&FC#__ z4x?4Obp=6@-DjX7zuJG2o5kAv$Vvg(eFEcsY_BG3SvbluBkV5uy4}am(Tf#W#wv0RW9~q` zd8u$Z^`6q6F;IE#^^{uLtLwDxql5=eDGK)~9Pzy24htGyLr|Vm&2jQGK#p}(S^5w2 zZ8vFS^T8^^w=dLU(Ic34>k$4If?;ldcj4<}lw{-xZgt^L`Y7f1T&j8+id~@j8jTb6 zGZf6M{JGNCI7t|0&M}J)+01I040sOn2=mvOx$q;5QYsziuG0r>xRQjdjo6`?CPm=gso0( zPmGf@({m-t9bjs|8D<@_(gGUU?d(5Pds&5icl9JOPc?7d-AL-q+44r)BOF1Wn9Kls zsQN=9SpC{40`A$oH%?hS*lKj$QF65V?T@@=@{gE%AeA4k 
zG39YkL9@eKJH5K~#K3%RA_DRHLFsoUv8y##w03jR98!c{#h-_8W*&ASi~9@t`s4D^ zzbK5$_4`LJ1xoC_TBW%RM&t(;xHHzO(iNmAX|TP?eYKCGY5g#kjucVoVQUz2y^oWX z8(c5;ao*s#NBK8Gc#xN|&3$enq!0&4^A<#Wbr4hE4s>Q+N9;+3+2?!W@=$I0nl6Xn zx`cPr(cDHzb<*wW=s|Y%s^A?#O&o#$UiFgN75U%oqfgh;|2>e%xk@m1VFlE1anP>z z`qL1IO&eJY>T5m0H6pc~_ZdQ1Hda_RyQ9>HXsNE#CEMb%6#6-GDxU}HKB#s6M=0TZ zf6Y3-0LcQIhD(nD#du9?_axCSbi15SK?0o5J)h5@;{PA0tIrdD<=;~Qz7&?&sdkCO zhJSWF>w5m9n0Wz3@_$xmyHPM)@YrSYU@F1~wr!Ywe%85e>-y99(^J5DP7P!*U; zf74`t7{aXu9o@Kar%xVeZIc%zEr?x%^6?+*33bp>36COVy3z){8pE0#xRA5yHn@domDds?#9X^a84L6wdrF@s;QetP;5@Z z8jS@Y263t_qg{lP|ADhLU&rdDKFa=fJ;-G^#Wnn$siFTBl&EW}#$Z>$Fu2~^N#$Cf z3T?~ydcHAzYpvuWNJBqwK5`?$-~5=>CwTE9a~JGNT_zu`FCj+G`!2K&AptX^;jktmq?vGTzaVLkE=ZK;>N2c*!cw2T&>k+j4<+@p zr7Je~Y2wb-=ypvQYt8dxW$lN0bA)8$(qSf_xCalZaGwiD=SNXGfDkAh-pi}HPhJ+C7;*Y9@mZ@*IN|K z{VUAviOs{Q0Ch=doXB>gNztA0b-Z~Gq`V`9OOuQ}>M`Kb4C$C;ax_5k0qXgjYv#*)c4O6Kz~~^{XG7)9AIY z@Qv!9uEWWudXREwUqp%KI8M|q%?*TgE~)z_h>e$jy%yetWu<{-r%UbRwy-qMT5PYjW(BO;;2}{Q+=0sIJfT@*F=eRB9KSRXJ zWKD>l3oGRL_C&y!fV|A(>)q=O8{1F0IP3)=8QX{;yN$3kY%DQk;gnZ-tg)7PZbl<& zS9-SY-7qE(?qkS;k^0+|Hw2|F{>f!K zK*Rlyb@FlqlI2S;mviSR5FST!@M0`+IXl);;!!A)YNLa;HFXS4;_94ZYHS)zp=zb8 z`F2hqCDB*8eh}^yf-K$iO@lM}UIjT)Gr_qYw!1Os19NQ*`_7jhp>RO{B0%UaCTH5h z$>TkDiqgnsgw8}Ka~aDOlniaUYin9%x{69@zxHdoX~D3b!Mg%qN0P<8zd^5y!ja-= ztv%n!Pv<6$-fs#=E)#WnwJlfQLQ}{Gmuu5)DUnZ}QTk7WqqYym&nxjV9MO44Q{7nnL6mfx?j2C`P?4P{-f#oAGo!#G)#&w+I@yDH4+96|nk(yK-KB5_Z(mEOky4U`YX zxg7F%g3?vkHt~F#7*vn{_Oj-)o-$cG=``>hg?>i&>#_x-AE$F20WTZ>7uE}SpiKS+ zP~^IEe4`-7_^DO@z6nas#)AI=^M<>M?0}5k)E8Uw1#b2-i-G=&de4A`2iLKg)tcpO z?NcBqbNEumHZ-;6?OL20?7x4rO>?&Ax5L%A>=34w6WwfXCm5j~s4=&W8X?yE6N~Y? 
zA-r>&)zIQ|Pkud;k;@R2dd_>BuL1K$!NvR4tBxdm@W>e37ABlGMHv%J2B&Lh97-)| zSJfQ@)4-j#^xzah3Gg@~CTf-NRE>ltf~~*H2cAkA8S*h|Fhz1)u9jrx5}$B~=jR#I zJdDZr>hggFVKwaBc=dM|fzq2dz)0c})D4{sdb-!fFDJe$m1KM+z}~hN@G1~tn7g$b zZLXon$8BqjSy(yWt}BRWa1l&(BDrjI-N^TCbFa3vSvE>+!W^W$gxZMx7OGUwt^Pfx zEgV_yCD~7)RJ1pmpT{z+0N?njKm9mjq{Hc{q`QI;4NO%VcqeeQrly2&7fh+nO_mcV zJ5L18Zf&jfKA4iss3ooc`V=V@eAHY=he`))Xvww>aTz=U*4b{YIzEJwjUB*N4?jmq zVO*DjuSbbrkA;i>MNpmPg&srI+%I}J6{-0-Jb^?k|I=mkM}nLnX#1f+Cu>;N+tR-l@ss7jk5=^Dgz-t=rd zUos@R8p!YJq$^!dp04r6i%1kPGJfVnje56&(q5?QvQ4-vlN;m^2J@qpb3<`q9C?&V zd3FF1tp^dx!V%=t79YUfpnSgmbzSft;b_j|;NG&RHh2gvia0-Bdp5&x zc&Yh>G4aS(AN}#V8~A7<4O#WGHsY%>1i7589_y$uhTm@Yj2#0~{Z3VrdmIe^4=z_f z!`e}zA?nF}>QhKq*y|dpoNb>$2#lJUxx8}q*k}-`3e{tZJVBFFSK>LtBEAt_CU!sFzn1td8Vgr zm|X8aG@R*Qsaix;i7w=x?Hdq5Wl(<;NORPUd#g~l5W^i)vm~|D<9*JqYZ72$cWa5h zURc_0@hro>4QY}8%+F~71Um`qx?3Z2C*L(6$h*S0D4$E#4@CQ?v_M3CKfhWWuHifo zc81&j@ERrzt=ntJ%{AFJB6i)hW-6Dya?|f#R4<>wfr5ST-RhW0#bc$!$gmzh(f+xdkSdcD_kaPSSp9yEXvA zgR{{Z0O451yS0V4Ek7MSG{XFa-6#63|IeSczz?avZ%?0VZ=UWzk+qjEoT`!P&fr{| z1@D68Xz5y0Agp8`wKi`LknCTtiarFUOrzF;*v>FQ7_XjaB&gdHAB7}3QO-|e!cm{+ zLb4Tj6e&sXE60IjV5D=z3l7?V9*<1d{gV@L3N>nBt{VKQB#^6%^!R~wsIjCy*MEPw z?O)8p;lO5~Hbg~TKuYD)9P0+F{k@2$gl$OGrsbCqlwzuC+-2eLIN?$!b`KL(F2J}7 z5XG6Cpu3jOKB(#4b>WnBZ<*s_550(C7#~O%%?Ifwm^_?#lE*Dr9@arw?^5DE_-)&e zKV;ZoSL17H+Ko<@QKKX;6jsM|8zD(*V~BPIR-{wqqkl(O^84E&)ZPBe!+9|LUQc+l zJPjCnPZ)LU88Lr~pc3ylFI^K%Hd*retdBeE6g}=AB1DC=r%ttfpwIg#ZpE(^*hdJ_ zl8$V;J;U@x|5f=jR|D}Fg{135J(iw8q&s~odznu`1v+=3O7{#PiZ}pQt$L0S9^Krc z4t)t@|LsHUd(%31?MWN_zJLpq%|)rwd1*s*Y_U zFdFdScOU#V69-ZpD>C@h#iK+d<*#fxnw29ZvHi4tJl{KkRt65F+S)Uz)rlztd|c?x z-_GRsGSzId%ps&;1_0vE!(rg0*{?RbPt@spAMu`WcyI14#oGNu-?u(0_%zW^uKs|@3#Y0au0iN$Uww`B z=3oC1iNa6L%$76O=Rr7`uygS!U$@h6Kh&QG`f+aBTI1uyZ8z|-awtMG4XQ^fRHmni z;e%RLdskfvxqj2jgU0ou2ymn80j3C$ctS=D3K0l zhFSBhE2*j2BZp&wwc?}{KyYj%wzqJ4tg<;<8W4M z`D^^cTBvB-VHSud{33@shNlKk1ySdqwXhi|6626jdYMDO*&DC1bV0m$-9i7k5c=-j zY`dek=J^uP*O+>F4QrRtB9sI5c+i!=bKN%H)j*S4S2ZwdFdQ7t>d|$Cl=^U67h8lQ 
zQ)^$%jBg|c=kc0YF>5$HXnrO=z^wbfg(lgykikz|r;ZT&Hpi{);;I9mAjS}n)ZK%% zPrckH_MkjN|50iAHX0U;#w?PzK4b+YOpdkpQ|`cu>^8M_pWIzAh2j}d?+x7RwHv~t80oj{7dWd zvpbOK%nL}7%=mldVz3d8+|SmX&1aft>?TB2i@3178Hj>iq1&XprBA~#LNNwl%oyP4 z&xP09X{~zF+lq*J$A?>r+Y}F` z;fPm8S$V$V=)0Yb1F7Bb*NYEBV5(5vFh5-d8vZIqIS-D2;do;8`zJe&M-fwKkN>qc zVXXgZvM%j73Wo3F-(c7Eyc}6Q42jTM-&DRr^FMr2X?9d*|Gv=F2iK{;IEgj%M}3I-AVDQ`DuEx zuCKh7pUlY%U56q7$4pv%TufwRWX;))M4ohA3yI!@!okR--r!b#w3=M^^(_TCCRn=q z1WLN@lxrKD%P2};%aOMQOG-b$z)JVo36oOYb&wHNS<&6Zd$%pZb-%1G)i~~cemPmO zJ{3+@^|D{JCu=C-bEqbApMeoJqqEmnQ68cZ(4@5}wdZq)?98>bV~+wf(-iszkb1S> zPQz?XqaPzBzx83(6BPMbVrlD(Pr=lg14B$0qE@oiQe*dd|G617k3+_s7+Qe&wwAKt zUtX^W?{d4-NyEvIQ&yYIZA3|lwBCPE?+WUx(a85S;cpg9yJxNSeMQ>R6UN_fH(?GS zbj<_4m@}MWa+}WsWLuFa=|VjixJ{UR%t^F8gMiXDk4xp*4yLsAuK13`bR0>+b^^&| zqHWFZg1cNi!_tO!yZb1oJQBmK;aI6>DHhCnsum6*D%!LG{jeZTvWq+d$oi!9WPFW6 zN!gZdg||yD%fQ( z`JArRAhq`j3f;7B_-euzYE$K!a97?xtc|eia1v_kD~o}8-e#jWz*G~L9oKZ~W=~+> zrNCPVN$=c+6S!YmPaxNCf71V)3rv=S&0#D|(A#a8v>c|;UabIl{&?FqX5N1|(hjcO zg~Q2tQ(0C1UQaMJt`&~^2nKM&|L=vePy1*KXJ)F4T0@Y@P+h3@nXtmQtMH}p5X6k7 zH}O8tZ`5k~%H;Fe-Z$9m7l~J=Uf4RIPHDsIQnGf-<`bmU_cyhAPoZkl_cL@s`Prjs zL$c>!6~4LYe+i_3-GgEqHm_G!tEi`}hcI&_3D=2Ky4VOs^!q$A&b;A_VBJkE{&c&B zdoyZPxEjE;*Ne9xMB_tsqHrLv7Q)JqUIf=o#MGijlv|OK&q-TtOdVFB*Zee?;Gunt zypQD}Oci4XoCYYf-Kvy3(IS}B_}>L#> zI-FlUjHTNaj^xg09950c{J5{0{;`Co@BIz7EnMQOi3H1r6k|%HefO=4Mcz^<`r}f#Av%#V!r>$jdx5Lj7HjFhHlJx5mj#GwV$QYO9&|* ziF5p6&nP%u{r*e|{0bz8>C+~OSA~=Lk#bbL28Q#x$i$b|5pa5>-Nm{HNT#-M#JvHf zP{Wh8EpjuzVr{~Qw}SNJre7TFPF>()9{<`JCz^ujw_fe`8ZAIo&Kq+2x_Cxc(mdMho zS#8y-&rp*6>lbYGv*Ub?b$&#j1F6uCvSd;F5mL!>QVsqhasTAfjaS$*pwx8%Lv`a% z`cECPFlwjKr$|+W>~!gBo((^q$nrsaT2<*v*h=7q z9@n6QNyF*eUM*J*OEgNn2{E~ho2rVn83F$X77UcO2&Z^%c*r`y08-cWo4P${2mf9~ zqmkz8Rtt+;;Z&oys}P*Q4H<)Vz`VWSx%e)Du^co)h z%dP(Bwz|CpDuDDJT-OHlKOU)8cNxU~`?c*MO@EU?~(qdG3V?(z`(Q>17DsdZCpD2|?#M~u;Jok;nkx(xR-WE`8h zlmZ??Sp2x|j{ICW;;(m9rTGXUvO8R(t}lAX=(GeN{$7{L6Tzm_aP+ zapt*TSoUyFPW^s~6bX#E7?b7yE9>~}gL_|P`LOKkI`#7s+X%{U_yBVkOk&+d@(dm+ 
z)t;W_XI+}j@X?~2a>W^d6z{?B820{g`C`2o!rR;v_I-GenQTRfnK&Z3{yOG1#3Wo} z44tQlgJ?Lj45`xS*TJw84G9## zgAc*7u@Bvj@D2y6A+MLu-UygnnIew^P1b0iz}oj}HZuk*^7_5Gy7&H@{p+uPA48If z3erxY9S@YrNDbX40^4el8Zp&>zt?hp^QM|Xije13`8h~gYdw{92TZr~9}9_hAU9=R zgi>@n;=a2Cr!a0>%TD)Yq^>&VUgIkWsp98rfr|jIqL8wlhoSesR>M=SW7kn|R#$t| z!XlL9IG0w7aRULXW6kmFCO`^`t$$?YiCnEh#;Z#R$-BKtN7kS8U;O!c32GTkfh<^R zMsD{%yJ4NRgECwDajE4FF!-l_R%ZYI|6dzCefB@8A{-iT`^oo`2tPR1avw-}Y{8u6 zz56LbM0kF#dt_n_DY_YBzn?*2F}|1&VbWWzt(4FEkE@oq&8A045y|_jtmTWqI^SHz z^Ee+LuDskG}!5Oc+_oPt8eQL?@5) zANp>qFk!aFqB~N*4XLYsmPl#?dN7eFk^s6*{E+EE~-(-Qo!PEs7}Kli$;=*MAFe${$2G?9T zE`SB*0ClMP=(5*&u&aSwa&-7`G+9$Csb1_Oj5TMAOE9i##i=a<8}_55psvWQQ)9lm zf|i?%?h(6&lDwSashf*5y-O!52L~GhK7($8c(lTUeXWpY)XRb2?ArM`? zUQ;9g=b#+hrl;nh|Niyv`pyko*1xZf4&r+O!S~)qZB=|@|NHRxg_;#^LQtLy&8cm3 zA}e`2G_VDfnUf;5crgH#6vh?x#YALVZ8>dgernH$Y!gm>`|5hz!9vgPoJ1Ecj zsRBEINNsMs8j_vC{B<)6*wz2O_TB0WcK6r-S(VrWD`0y=z~@6Cp6l8Qr5o-eYi+E& zjU;j#;^}in!LEhs9!DHY1e~5P=^q7gnq{bCKsYsql=yL&Ku(uG*o1IWvMfH+9unwd zyFy0nnZ)L%!ODdtn-^>61KyiFE!F~%DayX-+P-qDkiPOFYH~h1S&uYbf>TT8##euU za2cVhMGlHyfnsjn&|}((U3-?b53J`$h;g)6-{II|2{SrK^d0+Ufa~T(f>R> zrDop@tOlhnfVu^y@NELcFqTk==9L;Hf6{;3F7RIt)X207c^kxeExnWXN)T6q8jjw9 z!l!p8Q`7F|Ys0J7INSr}b7s~m#{GnK{krxea`xz_6J7fFCqtkfPSx*9WDH{d)0yOrcxdL3z9^3y&I zYZs=ks{OM*o*mY8pNQv4t8S00HTy3?rE3m38@{%#@|vY$;speyc<`6YC^iDoz`ORL zYGk_!MUmavSyf;&WGYY(5^d?D*16Q0`VXK;=}^1w?nOY=(x!F}rCSjsH2dzmHILs0 zMH=mOr&>E7M5?skVc3GHc>AiA*pXlU>?^EVF!lPuU;b)ydRI@aLpi+tXRD3djhfm{ zmlU!G#^03NtJrq{A(H(tko;PYH3EjY3ujNCZC9p_Ci>p?(A5|av)ShezVDAB5b)lS z5xuu?6sPuSUt8@wj+hKi)ZBIg3}5^SU1b~oJ&0%0BCilhF{K^T9v9a)|4 zznyAzVjp+u+us#D`-~&tFfv}%CCaGjDMt! z%N9GTGVU);Ul)!5IAPZ+*&;#;^if+Dx&hrrjkQljq9 zV@{tyNoaq&?Qj{!ypHuRmHT?_g51sr-NDNhh-~UA7>1so4#9A&F@v&RSoG~-n%2|o!sKQ&pk;VIN*W6Z(CVJYRQyXT2&_*&z?FQKjw zyh7e>>optxwHDnY2Y-ci1M(ToVWEvs*r?Sy?dv84eyKEniZeqsjSje!v*_j|Vl9!? 
zZvj$!hsv#H0E}QxPUoG<7g18hPS*Wf)h=yC!r9PIzh1HpE!lS^9Yjb?>+ioO*6pat z?w|bOE36+>1U$aPZ)7JF^ZKA&g51n&7a|E6T@u%BAUA*XCk|^^wt{D9IVs|QjJ ze$Ol(RF3IdYrm3sXI!_+*b|S}WLv?$1}bB>a<#(L^ScrquCp17`PS|KD(a2?x7X?b z{Y|hlnA)$Qid#M9y~&B$s`pDM%AclD_MgD?@UzDZ*!#$J^uJVgJCWthDO+t{fun$G zm3@5&K_T5u(w0!}f(pqY*1%mj%y^Wvjp6R2s7Gh2P1dtRQH1^Hzs}n8Yuzs|;m@E* zuifVQ5KLm_WKjLr=O|RdGnO140r;|W(|MDAfgr0BnXx^FBsTr->B*X!Jppmav4yGU zCnapB{hsB!@poG{^gPIqCq(@c3d8#*YXx}2zrN1m>&4IuU}RHHV6xeWkoc!sOS%b8 zO`E8v*s}$?84)jgYu36&7}GU^hX(S?nT}_sD_6W6T*y+fp63Z55`B zR1&O9c!z{jOuhv&ABLl$HxvnPFY;oRz-YSwOrq?bIqKH^?NB9=O) zQ=YY-fTVDuE+k(Tri1BoWqP+0PhB{8R)ezwY9Kyf(su+SEFJn^)W7GmbIrl;9?2`wWPNa#uw88az+rz2!{(rEszzZ&&(k z7-$KdtP1jiuq;=<-*y5v26a1|n}SI0*l+5=p{m%9RI+VAl8&4yutE&Lvceac*~_*|Aku_5ewx-2ci9hfq5CPu2p+FkIYLZRX`W(tpq#qDXACr_7$r8$V+x zsqyUzU#5*f5v<%Yp=^-&C@R#0K|U?`W@&O(7`_(iqjx^zmP> zx;EFtw`vG7-+#{EJ01T5Qdi0;$1zmri~TpN-#7EW6zKd>J6&K4`aKCR3Mc1kA*x%xfkI0zAYt}56ZLdWly3>AW}fGn zIn{hLmJ)e1gN*rx|t{{o(Z|iva7m%7=#`8GwWZfRqeFB#GxTK`{x;%xYD~ZB&)r21Qx>Yqm28;_o|OXV!w$u=-=B zw?i0zYN%#XmOJxR>y7m7yFl!6k&@!>{HDZb>MpN6`OOxRVf9cz?Xt6HpctVh?c_*) z$~oClfMhu=@I$JJ$QVK=Sxc8jE=)Neyzy3>ZUd1^=NRvs<7g3Txj3?BSb;pSVZbe% zVxOrKbTeR*x89aIHHU<|@}QF0Jd}Dk{{eobzR;&|sIIfT*hg?NM|%kGQX*(v+i>YJ zh?|-jwK{tR#?AQ2@|Y3M1c>ucyHvPZhHFWHAP-l|c^wQ3BWBMvDp^EH0Y5N9W6*FF z@LdJGDV#d5D%N!J76Kh%hr*T;^TV1Ge3I~?inJ_D{k1o6TQEG;DqOkMlx?q|Cd*T^ z<%Dym$9(Uwa2Ljc<3#LOxN`h%Jz;!5V8(^iY#1P0C40*^Yfy5+YQ2QiS=j5QqGnu(dT_arQkA5Q#ki!`Z4*wgc8YMcEi6qd>~b=R;mv}4o9g|U!9 z@&q8+oaJNTpgjF{hG=G>tOk6*!rJR1bpT*4kyOM{NzX%)8{IoL%)Wu-Tjzrrcz%4p z^}&|_C2Yfpwk&ozv1j)9e^cjL& z1glE4DtrS7Lv^bJ3x^}rTJF1*s75K$T>_$j@!8csMEC?DCT2gaKMuK!+8KrUm#IHP zx&76r#w>)gf>3q-7Z|%>xSRTr@a}>bn3?^sjuG6;hgnRl$=&@B6wc6P0gIm|ew7wl zgTfNO<9NteXZQ>?jhrEPd+qZf3K`5cU);}uh}E*ieDk_}grq*a{IUM}3*j(0T5ZN- zFjX7}PLj2$+>^wA=~sS+#V5X7_?lDLv;6c?J%RE(;qfy@1Yagv`KXs`!G?cpT@mlJ zeccz}vZ@sZvr#zwKX3~}Eii6EDgHUn6PaWQn_IK%BuU% z->cT@8kh>|YJN*@y7B8sNpbJYiCT$RLk~z?~)h?i4tqKU9-8`al;r2SpPGf=Mo4~n3}^)=ly)TjHmmM`o|>%-k?z7t 
zalf9n20!;ux)MyZjoteQ;W%ee@}AYFJ=MFA)p)cXXblzTR*im-c>_3iV-w zTF;YA`T0ra%OC^W>d7{|xUNAD!R6M07m$*V2?vF3gq7mG>g6{HcU8)BH)$X?qs54h zzr@ZVaBV`<3&I!Lv;vgt8j8~biib7TaWj($?;%5g6Ybj4IUrV6^{-unIp2S}`h7;XkWb$$pZbgaPlw7?{Sug3!f$K+zV^$A zk>Oa5{ar}{`e`;U-066kZ;p%97UGY}`}%VD*(s7fCLBK1WBDA1BLdsE zKP>y1K*5?OUcys=gjfH*XLNBo1LD8|p;|wj3mQ-7IS*-pCao+i8`@5*_NB@uGCCIuOUlx`gB!@>p+CvoJ$tL zl*yh}d)4L!N-|+&W$SRb7|$Cfw-Bm9JFV6%39s9d#iKH&>Q6pFR=x$nWx=#^6y|Pg zDc(j(Yh11{D^RM@Yvl#f4dL$epY2`u`M--K-?wTEcuzQNx1AYd?)xaR+(&Pp3Z{sr z`(N9z1}gr8E45Dg8IauC7{3hY$9i$p)GG5}8<$mWKSz_`;r%RKFbvm?Kwm(~dwjyJ z3e5H~N<_q^qHdIXl5c<66#Ho(+nP=rtv*9ZX5SvK(|%Q1o}($-OW*lrJ-KjFn_QTx zqmdi_oplisu|-cJd;y7)+$>_Vhm<$Mid+M)T6x<9hQ;%%R@Y^)8HErW{}X!)kn+@< zs{@ey^1fLO7GFf@Ce^hImSM`)ByhMrp}Q@>x+-}L=KBd7Qn|NtdkCtu!}>LE-GNXw zrjEhw6qfgeTEJlI`Q?1orQO1*cze&)lzLBMI?7w>7=jew#D{IsEkI#^OCJuj`Y63o zVP(-cE|-3pH$sZK*O=@mM5lv$7`I?5$(gF<$3cnAPRMvdIMSP}YXuq26iQU$fJ*T_ zgCN-hdN=}`gTnct>3POJ599Tn<_om|!~kriR6EJak+5;-AR8A=e!1>vFA%t-!F7>`fx64pOW1Y7ggl{8+1+AE~#pJQl z$EiDv$ny?Dc;e5=v%ie!E?R_0#F)4Thl%C{e82zXtMh!HqIJUSPUEhjP~-cnx0rhW ztBvq>;^JXXIWubvrYxT){^=T0JrYhy4(Cn4FZ$^3PENJ^b{?aoxvOW1>ejX9EcOIB zMXrlOZHX}Zo^*tbw`HkkeR8Mm*Vc8H&wGlut=op@mnh-E72?$}Z1{IuRrzI;#1{n9 zba6p%&w^G*w-Hrk7#=UTvrRx`XrqcuH^W^`K5H%^y)Au=cV=f;!~h&ApIx;b^&*)3 zf7KEY)on$Pd`=f_>wm2)ouoF1kb+z0h%B}vg@tcx+t@ygg>804OT+><1s# zG;kLbkq+fSMB`btXg6~Bdc9UY_kgJ|%@3g+_#5h@yjF*Whe4`;qoh9~Toy7p#X3iU zGWuWC8YK8FedkfZWbjT6@Q?Md`JW#TY|s9Y;{+(}y{{e676?bCw9q>KnM6L~(z9AU zo&(c-=ImTEf%(K?H2Z1~a{=sXOr1h0v0Oxw$9VH)?UOh$=}j%SMJdPS{u?X1dQvuW z1u5$Ka~Ikj30DymeAW(;(LEqVF2ov!{P5s!R-?EWV4p?VZU6+q@Y{j+n_%>3fnEi; z1*f>}ewy2w1C;2Z95?ky)b$CPtUjtS_j3QmSaUPiB%mZsZhcz%mBgN{apxW3q*fQC z)+?Es0<^AL&5}(uxc887SyR(mSGW&Kv91Dc7Xf_=M-USplEwBKLfCdRsLATHq(B4j z)*$pDh~py%#-JSYV(2r&1A->Q4kp9!nq ztKU0YQ_AO%vb4FVc~{*}|0NRU>*1W{`LkhbV}AAf@@%^XFCgH@$|2Ei1iGqHopJT2 zn~)+;&Z_#<&3%-2G^NH+>>Me)zFTeEK>x*&ymFqO-&wx^4ve%;dW)jOvkfI`F}dtc zBM;`I^IbjP4i=a17neH%4sxVm;Xt%;AeTSvLLjQ|4E`L;77p{nHi?)vobv70EH>jG 
z>c6+&S9k9WBc%VEgGL}3Uue#Uqd@8LH{?4e%+i1H73M9N@{iSG+%Yf-jFI1QC@jP$ zbs~`!tQpZ1C`nL9FD$Q*+SF^V|L)b*JARyp({_Dzu{zl;B$^I61HOwewqCxdyBEZk zuA@Dkei@Y0N7Vvd5sn-_80TSLws#dNYI)GzNq!AQQm>A%=>A75a_z#dMU<3zc-qY3 zMjydY=tnizyqO4-?cP784R)r-#?lHcp+$AwT{55K8)nxYLS7Ec?Mf^g4AZ6g_Bh8% zVm$cERwuPJcR-SVRd-7?cl-DbV(>$y&3i~BY$C42_kpl<*tj^44S$LjmfBO5Y&&0Y zFeSFn03tlL``6jDaJU)GQaRH`iU@15xVrpDD5O*`tJQ5=U-S|7p0Z4-c05K&?bvHA znw|I9Z4elK3e&;IKhMV?>VfnV|hvKODKiW&}xKguDju9*J+jprZb~|0WBGF zy*2*a2$P6aD9Z|)fRz5=+4kVZ=KQK^X?M2tzs}SHcMKgNX|4Ue690=mo@;B#TVWg? ziVOU-B`QTot;P}&pIJ|M}H2|$7~Hpb_;hE z%(&=-saopYljxe^{WWwR0wuW5a!ws7tFC_-N%3o)oUH?4!MhmEC3X}g0(oPyc@B&v z2`M@w;7w?HIEoZ0>}$>4F<7a8a@kG6%s;5EcEZp^kk3ETZcYW6t5rHPQ1Y^`R9AG& zA;iSS>lN{NP*oT`%;*Ip8Cr_@UPMaq+tNMPZ4CfgQk=8QLPnPnQo!?PtPY`EL6L{9 zZlx}LxEkmdw^saXeFAl>BXM0vP}aTcn|6!AL+t_&Ww_CQFdw&sn+VDA%?tCbO5H*s zaqY1hx&&mH*$(M-UD78>Wy~C+jymh*5mGq&Bi-BZxBE|fKRZK@6;upz)=Tep2bP;L z3iG`iVDeK<0kAGG#c%d`KVMUixr^d_3P%I&J?%9x@|^TeO0@?F>fdY0@Q4aCei?6XdX2M=2S$dh@iW8Gu-_ zFU>qdOD#Ovu55hXV{hn5{>vU`$9cm`>t*4sRzr~&5W>!@tz7`|N)&Ia#haiEGsh>& zA*hXmHuqobEBA*jAW>UD$Id|i+tGtZwG3dmd$kSAxAs53U(O`kz{u+G1#9m$o*eAI z$l>Abu+%5pIA=%yqpftj?}Ss!w7YG?XID?D!_75{+l`W{rr*bZRI&$^QSCMJ91;wV zD#Jh4G{9ALnsi12YTwf!AE5RN<<&TruvJM)OzdgydQEut(Sbn0=ut!^2of{TbL&tWGS zwFWTms$r{5ugjp~9d4t-D}WTgx;BYFz+`5Wz%`h_mwS?@9A?@y_%;&u zSMQWB{z{($mp;P)uxt064DkKk{(F9WCOAI1htx^co>mF*K8h4|i)HyhYKu)5Ph3Y8 zYp6-sVd0X(XZfb=QVRftad$A)3uGFBo^)<~AoK_+x-wWkt8!G!^hMIC!I+i+NG`*9 z#`*~YS+D*8mrsS$9=I$00w)v#e1_K5+?Cohd=4v#dwl*O!|%Xft)-U@Ki9`xETeb< z4l|Q>!m7zs_Kk?0_{@(LY*YW$_{oy*=01X1TMcuz^aQ@-)t3z*M7nE#Q-!ef{UQoO zdwa+atKcOOLy>kThM1eD0ml01zK!Nr_g=rjR+^m=WfI`9#cY!t4GKRZpMeav#U zueFjflt^(Vujn1^2_DO-T6+v3^#!g+*R>KzjOK9X&pVM%_S^nH(e|LG5Yz;_$Hoi7 zsR>%}_PCuP16oRRz~Y-R15E0F{(v?@Fu6Ij;HWwVF7_m2n{@D(kYwU$WbX!D{_G~$b+z?KJ>wfJwQ3#%K zYyJNpvhF9w%Ot%MJO>VF#7aoZYSAuQF*UR+YfN{uyX-FOrrRVWkW?j*CbO#Wm|l~R z37JJgrXiD}%G8SK!E0uOZ0un@c+8Gz52L|5^ayv;y-b_wL6fw^fkPs3;J^WmIB?*= 
zfddB)`+4H~%eQ)Vvq~bq&oADHC!UDsiT_XB03tlD5o?Nh6QQ?3OHfmsPUk_yTj^JK&E1COF?N_*+!2;X8+w`9Vfro#t7hI;C&unUggsau*J;*$5Ci*->X-*0&2qMm zAEpK$BBsorw`a2+!7@8J@q6Y8-EWJ3Rqg6Ye)C8Z!yqi>({1JN84$ZTHjT;W-FMC} zlj93mjqMs?2fCPhYFX+P7)`)#*E_uKzPFbq)*CpI`&>UQ!M7dZ*u36{LFfn^!j^h3 zTfbCQpH5B3*W+@e-l(ITk+d#w1)9>n_tgd#L6{(U+Aq#lL2-Uu&k?fNrZyRVt?trk z=gZr6!kT<*WU<=<##*RkSe}a32`l^Ver7>1QnHu=i!41o0c#D!pSI`UX*qi zDJj_`tfAB>LXTTZV8!hL6fM9(I$RtCWHinNtU@1x6uQq_*5iU{Un6Z-XtKjY?dr#L zJ|wx1B{SRo%4@M6b`Fln`pb*)*+4_vpVr9!92lO4d1EUs&!dEGF7ay_>jFYar;b-# z%-6E&uMy8BFq|3Q$@?-~^*_d8CS8Y;(stLn5UwIf>3Eg)ny{jOx$raKIv}0)bWnYH zBVT^I_ean3?qc1)K)g?|8vWC@CK<&R&5jVm+l;GJlN{6IhPRrcCH_D(nq9 z0b5TH=`DcQk9PbvMHdcp)w}p{**97i1nOIzKN$WN|DxSthhy_}C`YG`wQ<%;#3W+O zTw|$KU956QSvN;n`_+gM?~xfU>!xa*S==?qSmT8C5={p*^!LA_=z__ab-~GcIAtIC zqO={1rK`O){zBS-asz54vak0<<;L!_Q+4WpQx}1e_}ePcW)xZehRyCRf`rW`R_DAG zQt!Vya~weG)od+*4<-f$pr-VN_=b=o3MR2eZNmtXA2Es_6^y}L{7L!ctqH_|E{S^9 zn53wFmqaa2OY=~Y_)wQPE*PO$J%J}-Rr#CW@6*?J-x&Z`JDUxm4Yq$#`aSn`45JA# z&LZ@5gqf}rE9X#TP{(!JQSkF{N_?gstGbXB)V<}o>NzeVq)Y?7p&3JfMsS>!)naSY zq03z|$7^131+3=SZ0gkj`#XNQ1}M;cO-8N@Cv67@_&TZ8nj2{N+tEC7Yg9L3d3pc* zA2pZyRy1xQrBFM1OBA=eg!U}l<9o734|h6Rt%Vrr>5uQCrLad!z3zeXsztwMTJOVY z`+98Sm6LV%;sGLtzxs6}pohYW#R$;cM?exgUNeV9FkCJEbN1Hc=SgCF?wMVE3TS#< z4Oh)Q>%JYV>EH7XHZM*u@}Ul1HZOrRxKHC2{uM$pG8ksV7zGsNXs*$UL-PhJwc!|^3qeKoI zaMn<5BZ7*Vs1tCTgp>5RBOP?Ui#%Iy_-H;tgfLZ}2)E{UJ8zC6mA8w5?)M`F2f;Ak z?&fKSLy4dj-l=mq!-2}tq&5mx!?;oX{yW!i#wZljdk9x?uH?#;T7n>)aC;3SYj|@|7 zD7hLewC%EIi9Fv{5uO9Favu0RX;DDieUS)@Hy1C{a6mB_jau&0xLzTAz3xzFl&X7u zjhbv6S#FL;-=GlBG1uLyCwdEFv0U!!_W|1RW!2EK_rv>SnXTnOlG!$4^op_qMgI0D z+_z`#=o|SLJu5<>jE(ZB`?sB6R>FtVw((vpKn6+x&oXP*t=5B; zUA5vLRCav`)g_#E40MBVgq20SD)>e;Etyl?YDZ1qHYE)#@itQDGB+cX0^X~pzXg!s zY|YiT3ga~&lEpwGsz+vH5Qrw$3GSi%v~6Zag2O=C)`w$9YG5&n5Ph=MQ?IO0bfk!K zs&-1Geh{TJlpC7T&_mrfZWf(tgQRh!V)Q5F`DU_9z!b{n6rJxVwY^cRHM1xY_HKKz zv>s07Onl*kv!Ei3Wvq1$iugvHB*+78=erMvs-L<5hH1y+nq9t#k{og_<`RrW<5j}H z3`7oIY$?-vrTadodkpkZBAJ~_qoiv{Ns_xJsa)^y_{5hIxdF?fo;=sZ1L4RK+=U;Z 
z+{(|h?!(i1AROqUsswisRPW(hZo4b2_@?hwpZB_`3RW%kK0=aq$j&`WT93jYe1k!F znBecfPuB&*uhll+EyBrj*0T;O_6b6yzN1>$QxN}C?XBl$0a^5V-hH_Eg6oc z%s(^cHUM6Zn!+5NZXNFm1l-%2vevZ{h=?4NYG=Y%p`@&5+OQt17WPGZ+i?vPhE7k^ zlFwQ=iSHTvtZHXnM`&Ya{W?ODYg64mP(kZ8K@~0_u960^w+Poeq4K&s?jm2KhWO zAOA`D%zpty!uvV$u2J*LM8;F!clF;;+MVJCumo#9bE4K3LN2(yOI2-^)=`d>0lx?o?Ba6=+(|7uJ za$|tyfEuU+ay?L^!_C4G_-Ji5ZUK|i(b|?T)3ddUakK_F1E7K(J*(#yOt}X&%kq#* zo6(}7JF+f3j3Bdta*aAFoI0!B<5~hefRa&G<>n2ic9`@aYMA=0rmKfQ?NY_=g~p-e z;Phdw+f5?K;q%%pn(lry$~&Q<&LV_I=Ein@4e;mCQVS>Aqh)7d**&g@?M+b6f#I^b zyT#>s6gk-SquJfBmb06a`im&o+rOiKlrCI?dQE2H41rwk;%_s@tpHx}u3!V@((dng>y^VC+;+D8ZxmGPK{Pqk5GwfOJL zb=#ACJ9erxTeyV$k86NSvjdgn!x=d~2h;9s%UYZzmKR77WIu(Z-f%MSy*>8|K>@e# zGAtLA>*Ie?mf=nJVNPwoh113k8~3p7ZU5*Usxpq$t;S`4zlFzZuVT4yvKi*^4C5;h zFm92>*<9KTrpQAM+tz4iRlfeMk5$m>M6Rbh>N43HP-2GIoDddaErPl|8S$+PB7~~G z^-xki;m$xUj`gA7@#Ec$^9JGQgi=uG{50N{+wxQM)UQ+C49cVWJlt)8lB#Zw(-I1u zPshp=#DH+*ZXH7v`d|<#a<^kE`VcHROdi^*e}+hUteq_w1+oz8xm72@@`Mtt?&$mL zDa`IszJvML{Q)iI5YS_cwXNOL@%+q_3TaFNQCxc2TsYEhG&R2Azo6HGxT@EeEvU^w z5srsF^w(hc@}ka?2YYpQ4n0}c!Q@&OJ&%&YvhVN(Sn>Fulgz~~`gCn0OW;y|HWvgg zci)=#64aIam?!J60$80hi@Datvy82^U56v{W93?>D*8qjrJnwy*>D=+k(qXi;ubS4 z2bB5+Eb+s*f7$)SbAq2c5J8df|}YkcTkjaRSH}fzvkB{F8$njVj96DL zoQj`s_wK~roY-yc`{(pOvHRZtHq94a5_p}w8h}&Xy9}_1eK3(ZjH36#N&2WGmyPdX zBwS4$t8J~(#64ObWDW?IopYF+ZyFL0CP2hi}=qYlu+UCc5496}S5dG~o7PMK zkQ&@k$2P8msRt9OKcN2k`EebdWx#KO_?tgz`CG3ZL>fc;^y-jyIYBIU^400~4j_dG zt8yl49DOgKR(o{kKrg$i)%QMtrLU8zVe>;MWf_@jkMKRpw?~_2??pgj7EqnR1-|^h~YM_o2v&h=%vj^+f(+ypF1E1SGAes%-??9xUpP*2Xp`^7g_0{S+Ng zQ@`;(JrCU#7vdTKq;FJJwLJ($c&FX*=FCbZ4WTh8?|=UT{kyQjS&eNDh)2N`F8+fL zBx-Xda8OveytgEcLx5b)Gb!fA+r=(`=Ewz?L=w|alN^Tb!q z_f722LfHF{KmL{;TsSpV3&dp%&ZFSM3Ivt70HjwlZM9^Zh02Sly=Lb;uNu6BAjS5^ z@MYl$WbtY5o(Nly^B!Ji>lq}tdb_Wl z9e$YKE%dg{$wy#{vR$jHh37>SoG$#VI-(~4-psctxNu~7(p{uuJwW1xh+xqr8>auh|Tyn5m0K%y(fM{OVCV1Bg6pv;E?T)EfM#sTcP@0RN_`7Petrh#NKbvh%V zSrqK?l1-hvU!87geu2-TNQ*;?_Gace5JyKHXrs_T1diy-5O(GMZzJs&h%Lr172S9>2*L&mz&?RQIr{>N?upx 
zeRwm!6WMNS7If?uf|UBVe?+5&)x8mM8K*mtbj=qC@GcM`E&ec@PWMoHO{4jKyx+xW zYs+|ffKth^K>OG=!fAe@>=vx?kR7 zxZgs(!t-8VS8(2;#J<$z@l-o;s;mB1S_VH2wRy7~>={U>$1}EEftHfghRs~oUsrZS zz05djb#|)|!`vsj*40p?-QT*sHQlemS$p$2#FK%de(!JMBh^##y|Fr)(w3QQhs;L1G-eI#6!JhKsQZbNpGor|jGg`a zJ~p{K)WtS4Yub+xk<_@M4Blko+iNB1X&@Z%#-s7qTzVEQ(m(ipMxaZerld9MJ(~oE z+Z^FsB0B)lYygD$nUi0fXlKMP1mktrRJ}2CF|pwq-d*a@x4upDgLdrSzi+>;9Kv_4 zVryfyVR$u^IXh=o@Tz~lmM^?SUUvts2XU9wI&vtDCVo_J!l^^&^{IZ~VDDYH+n`!> z;FInPlcz05J>A{>u&4Gc?g>X$+ueWgD2d4T(V`4>9dhZy!GnBtx^9g>6b>UM>>A)B zIE-woUAILrZDz{#TdMs8p*MZ2KBLUX)9#ZK5B`GkgQ=f-MCCbzT@#Xjs3!-)qRRS3 zT>+;0Xvv}fSUW=X3L#B`3%1sIyhceWTsoRC7HnGh29;182V@3>umC_$mgOU2 z*|(QU=%rC5SPuJcwzl9_2q*KC7Vmw#5+UDyUMGlG<@Yv7Xd2avwk-KFj#ZJeXfG>st6R z!W}|Lh3xE|85~AQf+kN|$tbMI?X*hoiJ1dk6pkvebP$fD4WUXq4&@u!7%yj5JdUL0 zaJN*3WU~9h?atDfX#}Mz7faQj&VpfL$H4AUmH<`=Hv!1$EMVBQ2mM^)wa2c?T%PZ~ zGgXq#1vs3|&btH13WPAA`V3DIE+J?!^Yz%k<%Cbx`;Au;=GKJz3r9$Lf%fdywfy?= z{vDKGSjlKinV}nyg1AJ|&IQ~AQxHl^k#FVa({l?qYG3en_pJ5^zxqG0aeG3o^pCBxNu;rC9G5Yq~k1sGkf6NkWT(ToM z9-%wI$#T^2s>`We^A!kK-cPPW(0d@ZCbQnQ zeKL#~*7sBkqWcKptiK%H9_ZlyYODvr2#3EY_z)~ReNFWo<4}0BPSZ}hO`=Gi?51n4W;;q<1E{&?97-}d$?FH@Pa;iNZ9nN8f)XE@tj*W+0e0`TnhV{hGwr$B zi*Rb&c!a!aG<&I|ZJ#;Wj3zUXMC=*gV+Vl9!{IM}>3q_|V2>%*q6g!UPYBi(qq?EPEFcjr&7pTs z0D@?%&Fx6{Qv^J1`*@defv~DK?kjWh97-E^RfqEmP3Hwta?e2EC7eRoG;VTx)qP~L z;aZ3+Un4D5Se-SkwBMjbn-(rG3=j@AKUMJ#NFEDg!g_2+DV3K59#sWl=Z5%=O-njRpjn2avS+{S($} zN$~)fLJYR+$_Mi^DeSMa0*5-dr-nb{UF==$u;(PK4$@1t1)6CTg_x^7-&tXW@D7a{ zoC8#dGtV_(I}6FyRxI<)P@aQRnQkAp^_BDaI2XPybU(M3mGFHrKWoB2qxnGec(878 zUj`#02C<&8BDO1N_!>J+DpwQcBbLq!V{0tS7T2NP5|)8*F7Vfti5uv#9Mki*aV!EL zWT-pt9BBEDs|I}wRe{TOB((<=apC86@K=%V1e5Hv0R!EKOCYB`TWwYVg=YuG&NCmn z())?wi+T>}0hGENo;c--q>mmVMQYm|bFGK=9--ji!Zf&tdJ@w2Y`z=}O zfqdn@P#J{5{OBsIO&lr@vLWN@e^@ZJ$jntM97T}k!oBMAg=JQW=_n=6O&o-K!A5Ckf}GE%X}3Z~tr!@P$*FrbGWuXO0rf6>mG5_p<-7CC}$U`DR%T zB|;gl<|Jtak`$*VbPg+pBXnnd)ygV3!u+TfPgaAeq{FzC+?p=_WP1u?Eu5-2qhi)U zQvQCqAzhzOv#(R#ejkFQ&CEC`pwD16BEu%?+ylI8>bHBq1vFY5u&<43wgBO7;fG|u 
z6%2Q7x>RKhAfzerBPw9TrWEuFUZHjEg7I^1V4La9-t*-rnpQR4xG zq&disrap=#hHO?Ur?L-~!rP>ffw9n-1oM7^sp0ccP9_)2*|88kT=>l46-D~Xe zWO5Nna&Gk080``i(d{}lQ~Ohw5hO})|9#C9uOxOI=%zDQdq4jB$L(zLwZKw>I#PKZ zOlDNfg1mNfqlDdw`ak+Kd0PJr@)>}+FN*o+!YSopoh2c^KuASxo3WYkvisd> zr5f_R>V6+7WBwW}gJ%tm+5>5J9&=zs-sa<}Gi?z^gm(x%TTn}5|AgXiS{h;1^*+^H zlS4vUj-L8C;R)F`&sc#JR_o#b-d7?dyZ-T7fLYajWB66;jcfr@GPK0j=9FttQoda^ z4PFbX)E&DRf?xSX<6RFb*{Ow_(&+<|mANE?w*ig`sXhCRP_H%DZbsXK+Jq+IZ~i)S zAV}Uj>QMC-;U=#-jJFjCcb0jZ6Q%)_aMxb#90ZdsyN#!z?&l+?O&5nji5b0V(W5{T zbLOR;Ks%71kCDFq9fN6iD^Suj&2GC9S5b)0J!u(S>%Mp5r4~Zjhy2d3 zkVbw3NPdQ{YX2sj7GPrS>I~7XSWxj70bG@6Bfs z_%%xEu$E28{0)K%vNBz32ydY@iT-vnFK}s+yzGBk$}HX)eZCwi>>c6suKLXt9p$jI z4>h-5i4wWZ@*wGa&8#kU6=HJQTSvoIcVBUs(@xc_K_RyJ8bhv4c3X6u15;}w z6t7IiS(KDF&Wq1=*q*lfIS=r~V|uC>SgQTtmt9N_q^UcI_3zRA*Med0XdcLWgAi-eJMcMGmkr-`2@KVm<~vYJ`?5y& z%l_wy zVR6<#-oS2cN3i=+I`qiIjXn;Atg-{wH<$n)}uz( z^h6VmyGU#k8g99UuVL+Gh}hfHxmw_sK$)Mc1>~(@B+wQO;zDNtQPoVHt@b+zq&!^J z)o^48fi#bn0EdNBr)O$CXtawvEt#J zNRTMo#f`}>vgKSry<)@5?fDe?$Ah zXxE;0LGc=l&kyS+#r3a$YvY6)aF{wh_eJy6brVI(u0j#+Eug0(+VSmQiEK@cXzt{9 z9+%pJKtAQ+RxRJ_BG)bGwgH%*kF?YK4}dD|A7wZBA=p}bJ45;iO8vK+j!Zy)c7vfT z?2~*;G&}1e#Zxftk@vhg*98oAo}ons#!CkM&tauHweUkV@gl&j0K$9Oech%mZDaEl z8i}1QZ-TG8-|1WJPdM-QrlZxhd1isazeS6v)|*bfgL?Vz8yRH+{+FdjyS+x5%LD5W zR9zrm0m`rMy>IK{g`*vD6x=2Xs}NH}?p2L;%eJjXi_PG=&j3m~Ymk!bY*}) z2~)HMX1Qv$e5+tYG)`axkV22u%JHCZn&EbDjGoX?hfpxbXUeoqBHRuq`ojOM*^LS- zXYMEC;Q$;B%}J}a8;pJsJsEz}zQG|Vne0A&YOGq$I12fkocZNn)ODCiFy^ilN#D#N z5cTh<2Et`@b73B}X9Rio&OyjIw1|IaYg%XFWVoy6JmDNlL_X)mUn+IJi*j^6Pn=!o zDD{?V-QU01edHu^eRK%{+uX{xrK`&zLB9X~`%HmwSQu%?Wv{|%P1SeS*(%C$E#F(2 z7oZ#DL1m&nfc+QlnaevT+G4Z=O+ zIXlr#xW4IP=l0cGIECC_$IRG;eEz$?+s7tsUMjFlTyBnQu*;E>;sDnIwL!lErJDN0 zT=^(w9=eZ4JRs9X3agORRSm#(wyUAkX5BTh{AJ#>CW$C{4z8`u7w|`FVjZMNQ|+Qo z7xUEAX-%yUP64)^@&dR?077c%jCotL>l?doxTZBE+yti(&TO}t(Pk7md~rg7wsf&~ zwS$LS;a=}*j#tjK9{*w4*nxa=to$zwCcJO|9)>{}N6i?wn+(GUOuKWs#50;-mCU%I zn$UrKI$Rh14hp9^nGlM6sEh2*Rqck4cZAvs#_S}5v`@5tbQ-9$96Xq?4ecsVT{f9T 
zX5c>AVH6-3Z5VF5{~a7F=dK~tLZn(+DiWjn&!ndr5&$=(fIaAI$L{XNKC& zORqUU@PktJ6y_n6j81>vJU%^2G8%Rptu2Dd_joJnlYFfKl=t$dL7$bQVg>+;F`PME zTVZY0;W^^cXszBKscmHK`~`CIUAWdRS-nKihTF~aSHdziFw2#o*Kk#gk)GE21|llO zY71a*!3fiQHz#S|AxW!w;av8=wkF1>3=WqAiObY_jMeyK1(MX-5-PP=8K`_F|KhK6CRIWs@k+cQ9rjswNo-+}yEH{vao(7uED)lFQv90CYc%H!=` zE*K%W?I!6-IMuLw>Eh#b_r=F`dSe#MP@!6{qv}m7=1{}%FwvZaDChu*pA$?5Y{#GI z_TdPjT}@Ti3n;Rz4JF2<`B}NxMba4BrpG0eQo@>4Q0vQqRf^gGxB@D_^(T$+Dv%uc zvtUfF!6}kADQ%Au(e*BhZ4gy^15VRvj)k+he6#y*q^?udveYf4s-k*#{C3i)>UH^p z8GzGXT=6L15O+Jmev^4$-$PIa`rD>C_XCFwK`ZSCU~k4*Quw)bI-EU3CD*1CRso?D zji(uTF2d2^{a;^(HW~=IS=V~yqvtIADKaTpHIUsiAoWn|1#R~LMd7~sN(cR-`_(;& zyolFPJS){E8m~}N>Zyz6 z2&J8MdTNDm+G#t;xe`ptodeKlRwe3@@;$d&IFhepF9>T8D78y77n4w;J~QfKk<8ej;1@42qb8ntdE@rq)zY_ z7L;%{A0F4n=LCte#;nzb&w?s&N2PgASQ7n1BRgn45C+e*jnfOTyoc%0#r#x#Mti{R z5=@$0B-h#Z%TP}%{!q7f&x}l!T|rIb%_B9G9z}@_{0yXO;CBrnGP4kp(c*O^DId!- z;WwaEyX)*@e7@Ai%FT|L=USVrZy_qlsfl{J@ivf>e8>kYwTHt>J*U4vlNt!|rRlq^8;%g`ZiV(Vc8S!ZD-i`;7sHKl;v1K&&yRy@slFMWPG>a;j{s z&S!1`BQbp$nQrZ{+YZJx10Cj6Mizq|KHN$_1V=0r&8?-9ID!>@89qRJxvLJhf>m$CZ6tU zD9@I!X%j}9t}QVHXeFd_w86D?wfq63Wcqnot3i-NhHLsWB%JbCOsK7vVT6=NJ8DM` zM-%;U>GA>Lur#ur`a@yq>no*)(B$P&<>oYWy!+tOLAnp6DR2~BtAp*?!)esWXz{PA zozKEy;dF0DaIT9%OiFMzF|r5S90Q+2Q6_UfqXc0U;KCV!UjR~v!&X9F!C(kF(&_m+ zA1%R4NWE%0S=&FCVaBGsJzWv30@WG=SzgUA>#6WH!SKloTigA+j)IH!Nsy+x!wIFQkd$lDv#n=7g9$Z0 zou30*4TF8;_~AMd^a3IIIC0K=7z8A~p`VOibwAsjF}9-bDA;IYpEn7gJu9KN!YS&m zdVup?2dC>&=d$lMhPmu0bFdu3(wA-Fa76%d`eh{$u~_XV+Ew}ScsuC18c=Hce3i(W zE-I4%uZ4T7X}{1!f1;7L<8$ls)p5f&39b(!tKa_(N)2J!h`Skp4d9YanLE>Jgq}Hb z4WZUeT93XdpGjn3-yRAMhH0+|Xp>uD#d8f?o8AgU)ttd!y5El`ABxu7RT)G`TW4bD z+Wo*GB!%5y_njy=gwcW8?O-ZK12HgBZyp=~Wtb~?2Z5dlo#rs%htneK($aYM<>?2; z0h6HG{H#vaPYZ|B#d1#m!$o7+S+q2d+RCa6NpmQY>va)lp-8N)b?M>r^Hg2sq|m?e zxyKHCegP%Tw2U+m!aGWnOlB)7K?qEKf9CCkp%l|-fvZA<2A;A8sTT8%_-MN}BhF3qT(0ZZ%3G{KYV794)8yGK`|C8z!`K+@T_cL${xR;$*G7Jf+U z^NR-+{;OrL6Fyay_eMBPD|a&A26JPP3D!GUDkr#FP@rYsTWU|{b{+$*m8<1wNx{`G 
zDX!@7Wb+EW67J2GSsRQ~Ke4ZDM>G>UEya1&jBQ?$y_aX}3+st0gbwVqk4l8gcp**8xv&B#3J;!K?MfMeO6MI`wcnHbbE1)x?sW%FOZ4*m;YR*0am%1D4()lvQe`eA#a{-plVp8{LDw5#?WMfW6m{SD@nH0c^y7*h)GaP`V)Owk9!uZ&2Ohog6UN={}ZmlcnLreYb zswf-s?YBSp7UdVlMa?%E-%U_VYoxMN6PuGjU01KM$(AJW)mP@cTZO~;XY=*2_&|pj z|3#fy8HAUn;-*1U?S9n|VoJC0k1f5B+V4)1D9!E-R{vd2W?te{*x zjWtzFo&(dYk55c^?SgupM@pm6r`1vZ3kW^=*EFK73SC4?V`#RZZV{M9T|$kp>uKpW zmu!l089ljJ&^K%(kSj=P+nq22ec`Zwdb0K+ufce<9M$%0uY+k;JN6nDKoPX@esjlp zv!giCf4CNTZlQ$ZugA@|(UQrLwsds|4o`K8pv`RVc2V3o$K$;uGj3(c#l_46@1vpPodQT2 zQzvS3?KzP2KPk)pqWi#!-lhKhWj@&1rbw><71>r;+wAW(Lby0QQ@7>bz~O?k!Pbi3 z<`V{IyA>Xie20)sKdqZk%eJ=CFZ@;QJS_)OFP~7f6;Kjuj&Sr%D^W5QcOT0of~6w> zNka>yTdRSj|52^IuK^|L7HSRn)_(nCHIsFLQw#sPEt0GUHR|skG9D02-rj*aRY!+6 zph&1rhGBbS_xq`d+$-FKl1A>tP95OhjG$im9;b?1x?de&EAy}wA#AQc`1>qE_j~OM z`h5@~jmg?bd0#X#KnhRoRb;6Sca%L7XQ#?x=qO4m#62?wJ^+WSqhnU?*@Q%xHEL`} z!w$iUrBpSR8HbXEZM0L>0Z*df(;(KS1G|t2lQoqrj}$CJ5?J_AJ4QYSMNi6Yy92}$ zJ&Q&V&SYWm9FTHvtLi!5!NavycL7vt>mQdivgSPVV!obe>*|+)6mMw0CKs2x&mALV z0>Cu+?M~LZ09so!SGzbfwfT7sWG1*3Ff^2Z=ywf+p@}f3n2`! 
zQg_V|Iy`P(W=sGpUd=wreBJGS-tN>R{qL|tXc)c^V`iW}eITqROu_5I^g~EOt^%}f z`B5M>`lNq|_JbH7vG+sufiSt7imCLc2-W!Y8;%VIaj6V>5nTCox_jM#vhW*k-_q$u`9op{3A>LfB& z4pP?$ryyP_!p2${i`9Wp`$YV_4F&RpA(yt%mmj|W!3XqRIQ@~kqMbS1h`Ll9`<1@g zgc8o`ia7ynMhIt4l^<)@^tU9A6MMC0M2)s0RKsgiya9m3%70{;mcj1(I#ARqX(-=Y zIiZKuvkxPX*IcSn`Cfsto)#z12sW&rCOXjA5^>H z!i8efK-LqhE@{*Mfjz0QamQW7eUr4xIe6`!u?7e#XsmYb7z0SvovDsx z*|sK^6YV_oav(z6?sYFV0PeK|MbTIJU?p0$BX*?pzx(2twQkA}S39m{@HGi9+>p{* zVdAg;i{b+*{3i}C)d*lcii~y1ZXhZ^U!rTC^4I`KvkksBcAuKba4tvjQKDCM^mnvw zfNaj!c7e5)Edle#>WS5@0Os*p&3*tcc5} zon?{ay)MFzc9-gY@V<8bj)8grN5JN5Q>8Tw0b1DLl8PDVqA)@Zv(z+y5iR2TkQd+_ zQe+BHl?FGvhFAkAywvdsmH7;TbepMM_*r>geVz}@LTX_4B8W@(I(+srs5cmW1(gl^ zgR)_-fiU=4$>j~GR;KD=$=mLy{(7|U9jLYL>Zk47m)NnkT`h1qLQ*+gox}=IDg)K* zRwiuGl$r}mVqEiBEtnGRbVvk`YY^y2KR-${Q{(%MFPq6(i=1ZErfMRtOCrZ9!ul=| z>ytIk?#pi@M7BY&RLH={06*aP*q1ok)O|IopYYXY1hVcQZ#QSRKxrCQ$es5VVJnj2 zEuDNH03{uZJot6=c{`?6qeDp2v-eyphQm-Aw?{nvItnYfb&A?6xJ}s+QpBRwGWS6g z`Hd)YdAd3T5?i^~9v75#Sr)nuq>XG>DbrArF=@6QN8?dq)awRgy}lv)xqLKp;?%;w z(z`>ch6%%KKc7QTbsv1kwCTLC%uE*jZ~;g$a$e~ojHzGyHHw{YzN(7}m-CI~Ij!(Y z_s!`xCA-?;vGGdtS|WS57q{1esL#CF^s%s0Y^4@se^wNAbJz?7yv8S?-oSsX2=dk?{6F_dG4 zj}S;^R+-wYd=ajecepI*lP)&jNVU!T6hWmq;9CRMXHW{Q=4+q$5=}7x{Rtvc3z9G${ALQu+2z zcnFnpBNWG4qu47Tvij^9Ll3j(+>th^&t+iH~j;5h(+fBz4>^DDJD@aI8 zxhU&^l=!sq0KJA&MSJ4LhGrur<=O}3KS@<>Kuf81YY7{n*wH$VU_#dpo^C>wbRB(d z{oCez!$8*p$`+`ndX~%Uv^`-}xG$b|6_Y6P@rs=DEEIC+1* zaDx$R2YtIZzx<2OneXW?j)v9l=CTNA%40TOx1fH?8X!by_DPO#K9@m2qG6&o4OGxM zgfy@I>f6tQD$f}!b$$T|FE(Ar`EkeJMYPDZ4&+n|yo93Jw4U{HKBofMyb>^WtkmHu zkQ&`Zn%AI6f0746$zDeY8>j0a-VHF!yS`iouFa5cqRPqif^`rkH)rLHhm_+s9KLq3 zP_=G)2PJu&$bHSbT@)*^KP{tk4+T>TKRUATBlZF6wbhwA!tnr3Tm5`u+VDr34?}v~ z+qi6d{YY8)TxuBnkotPD`}**FbCt(pASvYO-jdr>uxVc#q&$NZ?w5Zqo9BthJhw%K z7x~Gtd>y`U>SS#EOwH6a!dHo2ZLLJ{8dQg6_SuIn{;v%7(Na*}BiOxnu!2^fwX_?z zY{ya*nl6jwFxEA#W~5etvf#2o16bL`oor`WRt1;e;dZKWHLPSOs!6R84*TUn+K^2< zS&J4q*PY(B7qkv3wP_^P49R*F418h+T3t#Xm?C^SHf<`g0q%8(htE`BT{^oF5tnW+ 
zYJi&nTv#AAED)Asj(vVIRX3rxB<^pwLHbr8+3#SJ>PXQ5O87pbPs_w%5UpoIEYc|4 z5JKAcL-GKUA}!pslTh{nOcr&hCRXPVTC&)2?2B2+jOQEr zdt}bsW)cjepBQ?#g+G+=FlZrZHqospHU*jkC8rMSb!A54ECQiS*Ch9xa8IOd$Lf5( zmDO+^-@5=3W<9^e3Ut4ntapbl6MTs&kmD#WrKOf|R74G?muXUVLVWEdYpN zX|R4hzdioLdI#tRkn**Y?<(RZ3XUD|WCMgH`^i#6yA4YT$88pQCs7a8*3eyHVlij0 zp~F2WEOH=FyLglziTU5Vo8AjX=pM(eaos}%Mdus(*A4W6w6xy6`f6#5h>>~?UY|gC z+`fIUK|y|T|D)F3pFt6X9-tQbU>rNZ+}0#s4fU-MrhK1TFoOW>3xVd_WO&w)v{dhA<@|s)t5jM8u`{Q+XVXH8fzV`u5 z?xMN5EZSg)-4G+#AvjEZJhF`zgVDt5?VtJ7gvyN;lC21Y5opoX+?v#iWbK&g7C z6>cTgx8Kv*3xcBbHT9mQLRcm>)_{=$8#`}GV?2= z_X0sO5C5W$2)^thkCj`%R~>YRg{FgP1C~B(vqzLSiDCt~4Q=0oJvFl^Nzc0!4%Zs| zvR$o&AMY5U=)y65HNkH?eJhaib)9C_pTm*gr_AM-5?GZ4+Lf)<#z+4q;)QcQq@4pW6UJERGAvn3P-V`qK&b>5G^+7kMTpegLHG7T@p`RGqFxW9 z(y-jwmv&nGM)&iXHnX_d;l&#zggdHCv^2ZDrC)cu1a>vQ z3-@5!syn>uq9{=&DP%YOBl7! zz5+2HeJVrsxH2Cdn>cJBPVvE|rr&emwi-zRj`A>0`++o3rx^`x>3@Ec42!RssAEod4$m1OpUAU(zZEyboED^H;Wjz>B z2Wlufpt;mx=IZ3eyAKbK)w4d6`MHJ*hcPw{V$;e@ZBfj2k+~w(uIAtfK!bF}f~KQj z@BQ`E{8xW;ABd`>$l8L6u`E>&B;S$WBd?2~to&Lw<`Sd;)-!#086ei#+C{sPa2w-Z z%_qn0lxb(zATqFKI)9=j{MUn}Z@$Ky+<;SqOufL?saTEPFm|8gfMO(@1em=8sgJy$aU|^zmeBxmjW3JreJOZnr zH3eN1jv_nqG3D$sllG*ewHq$2r+$*`-FfSmvd{oWh zrEs;jFUvE@t3;~@ClvKHq%>_rSDC*FCQn_mx_;Z?d3Hl@4omiU&3l*a?lh|IT{YL} zI(5{rI(5=Ot@2-ilv3)nW-Y|aXeC;Nestn+z3RCNA=Nb3j^uz!Q=Z_JW=(#ix%Lt% zc0Mkb6S*w8uKRfJ*cn<4Cc`>1VOvEKeP|MFb9kLQ6fUWcyWP0l*nKq9+#yzvxd|y0>eo^z4tsoIijZ>xpVVPPCsY|JYP}tIm)_qgG%@A60*w!|z zhT(`|M{Vqng7|Fas@?mtkpFcmwYYDQjrOB8Q&64cAwZ^Pd1Xb%p=5B3OC8D|Z0s=6 z&aF+uJzMNvX>Twxi%7+X_tAgB=!>NySFY$kN>6s}04O%kA>sNM>6Q=r^I-Dy3%xz*X4%N;yX2QaSWGdD?f^ulQ`)rO>V4WSzKsdio$2umkt z=Fd#j$;BJ_c&1Kv+!PK!ng=U!E8muhp#A|Tcyy%t0WZEfs z%5SE`Dmmtu5#Z)@GztWn1B@rJvRPh*G}+ zgj9WPcUqWqwtg@lsj3sral#N>o%(O<)Ds*Urr(0fB35$*bO2OXUOwGUXw`8LCGCEf zGfOx>gpkH(0m+y)ERLgwgF47j!)}_6mNM4Ko4P4LB-4ngB#u?-*esm%y}nT6q&Wn9 ze(Qgy=z^*Ra|cay&vjqU%V`%utFK2;5#uu@ql-YQjV$e@>+QSmEl$IiyBKXt z%4F|KN8#HdJ~>drofBKN)N&0Wu0YG&)`z;kA6t0%V`G6t=IhiJX<3CE2oaUT^Twgx 
zOh&zl8qWA*X&zv1p(tma5@Drp2PyTQZ43gWP!lK1W940d@aou3v;6nq%*&lq=$#o* zMfZ_a>gTv9rF{TarMi+e&%H0QR?mJWK8g8Wvnlkl51z z>+g)xGa&k3&xahZGab*7D(TARg)lz!nXQ$+1S3ovwKaHpg&<*@A{xSL0GBm8IOQy| z(!N1SQ`y-(qrFXx)>dtfv9a$^@NNoFr}qfQZRz5u zU@UFzu>A^s$^firPGXSxAb`6$4mqkl4}nQh__O&DpI|!9&0LDGNUiC;7!|bu_k&)nqd2SvUe5srA&k zz_PwIsW}T0&JK3$oM39hJ2Pz;_B=}R;|Wy9SXy`iEn3)mZil>T(0vg#tPf8(i@`h~ zDCN3ZsR|3AAD06icNCvH;wl}`6bu_mgby~HmejSaR7XF&^Ery_r@o7B_ zx4o8kZla}>N9so7El`#wE%Lt&;ANzA?2a%-t8K;1+zo*m4Aj$oh6V_|l_wrFiXp;^PSgZ6?IL3L+lb+8{i!#U>%T-uZ_~PrL`VWot`O2)_sXxmu3tSfHbL{ z%wC;kVgb-nPhVIosdcDL9c85FS$}i)T|4`osQ_AUp?1cKvmsbhfvw%gPWpHbn@#8_ zbqBECs~SWhAj3ROY6#Hi+BBgp=L{pHNGIHWwxCyUR*fR29?c!x-XgUFNu?Z#Fb};{ zIJQv{CL4zkQY`N@w#o50N(#?~Wbet$Ni>yNj=*Fx4W&7_KvlZi=Gn8Tq&an3)0h)Z zo1=XU1gOn@dp!e@U+RUKs?zga#66#EXBfa<+guB*>8Q!A_(Gta!xWgg~R_yEr;LAPw#*K zTMU74SR9`(TXqMI0BeX_li<7E$0zFn-Fslk?sE^TF$IuZW@~x*fw20mK7$62~on@Bi~h_5F~>-gAgkjb}At8K62$e24afX$fVd z*FvfL$sEO7hmclVo_*RH{ZFk6bkXX4YxUU|LP;u%!yCHqy8BiNzY#Tsf85p%nFADJ z8)*mok_hAHW$K#vY(Yo`9jlq>RuHR}ZjZ=v0E)($iFmBM1{{NEBsI`h9ETFUUJTHL zhJ)JTBaMejw|~)eIsd+}R|dY@h7jP1OUpQ>RMfELiP(RvFF-OWmNa%6uLQcS94kqIUsK1=UG= z?eHQ(GIOrUk%iK(OKA9Q?2`IrP+`x==t}pwK4M(@R}tj4opl!T8bp5aFmydXVfpM9M{PQ^f$jiO zc&rJ zvQNf|^L&@UzPdDb0gUuGCHm>(cJ=Kd8YY{AkxRlDODnjX9~OQ@K3DQX`Dvr*KrfXa z8xH+#9WS|t7UQlDvz2MNL)osQrZxxW>-6FcI5l;|<1j4L%`T}u^(dZgVx2u&N;z&Y zSf{IQcc1NbJ5r6`>7uB>=BM&5O5{IYudL*x@;$`JViyZ=KdIyjWVM?I2ueQv1%-Yn z9G%w=8H9X!dfsvYg8&hWK_2#=04W1=RQvHy6W1)IOvy7SMRwRu^ylz@E|{9fzv!sO z2=3iDVM{GuqK1j_V&WCJR9oum7mqI(51?UX*vbWK05El19LRl}PpwcKt(OAcfl5A- zj(*v`#&V@$A<(&wG~|kATegRC?}0RNTE)8ua^bLPaNQtY264_9hE#b z3HKTXzvA`bW|Sq}=8>o7m0OUoQVV{?`PM+{&`!1tB#GWtz^3_KgzdHEHPpdh-jl&F zn93foGK?^afFDoC>h6V?__3Vl*Ms@>y$`-m;e|0$Yu`0}7zdSctQ~}z1d^37b+T}4x(g`2f(N96TDCpn(vL;Iqyblpd;%%Avka}= zPXqPNZ3E#M7=@30c%Gk?#1VFy4pzPY@&{6S36OCA-ci~uERUJ#g`dl-l!z-<*>}!{_qN65gi^=w=27dx)GvT z1*^I88>3K)zZxOMfA4+D4Y4x6ZoPS}Ag=mrzcc;bIuv3xY-^^ac2w3Q$#?8aytbNb+R%d>fwXKypUb6`yayZP>5DkYT2Wxe0)nf}vnnsJ%pnd7w+dfr;MY 
z?Ljm`?fc-+x2Su7&PY{^cd-Y}AH4b?y-BpR%G}9mOiUvx?)IhIX|vsjZq&_t6kXPH zXc*O?4wr>G3#R;@Qp_a!Tw*+SwX^!-^MUm;R_(H02z1$Tt|p%s1GOs00S)lM)b*6n z-epLU>S6&Kke{rzl>T0YQWF|U+f}`W5;6DZ>fQBxfYUKPFm6EFfiWfJ7p`Wq;8{8n#Q*A7fu2YHluST|X!8%m2IOm_A61R*KgcB8RR;Rx2mYQ+AhEuE~s<4LaZ%|U{hbKI0Ul!tR(r8yp zB>4`M{I=Q*UG`foJXvWi7nT*Sybi0U2gBzH=T5RGvl2~77XEEpuU-Xd0%peQZ8eZ8 z{w=YQhs-x4VV0{N-){kw!?>#^Fa?Kar&QVV!Ng2&%2Va)x$E7>jO1<<8w@IJL)#sxCi! zb5{~e0;lKeu+CLD3gkSrI2;+&V)QlSsw*eHi?{1wPa^GTqte~zNX`Z3z|2h~+;CaT z9&`zq>aqr4)axT1=baAKo$0$!63<;3U*AJgH^XCf8T39t62JMCu|Vh}txg^SOA@s0 zWdwjA|0Cm!%VJ_4cNIdH52Q3>pOf0t!1A9^W0z;3^oMuSaX~4}*GDgeqsIHKG}H19 z8}!NTRX*FRJg)_@Yl<%3n}84eQEMQTz+22@JF+FpyF|(Q;IjQo#kWf{+s<4rM@q@+ zmYEDzAf)wg8#`SipsJ&lh-9kLYDBcE`-VS{`BHX-UN2b=J0_0#WK9>zMVcD9uSHPc z_R#M-;hx=UyDsbD7zfS=e^xK?^mUX8Exd-v8}j{VT`<@vObVuopBkC=ay?H^hk7EC>NZg^RhIn-3!T6&!**U-Pmw9#dI$9VV zaYaSzxq%SD8sVtcn{br6F5TSfejS-8ZM+RvRo3o{S>zosBAi?J>*>0Jbho1%nXy+) zsP|BoIvj^(YUf%)_ff+KZ$2m0&;x{y4-M}joC-FMH9OI+A0hRGRzrfev%H8F4ko{t zovCHyCn#F%FaPPhX806Hs@}q;|Igs0>XblDl$i&lh<$kfJ{BU-l&_sLdfEN@W%Jbu zN>AS~w(ytg@pUkhs@@grO^2OBU?Shb+N4IvKnRCbZM^xdS~jvI>;Aol0Z@L|+?p%& z72WS6dzc0&jP}=TbyfG{FV)2A?#JJ$dpT=BY1!FbIy|v96r8$k+dN>e>pt00a$66k zM%wWLh5;cJ*DmPR83XNQ1FH66B30G65mLVIzV{yMARJkHr`~UyVJ5h)P;C*E_6Toz zSzOqPppy3NsA<*!kb1Q?@sr2R3;iHkHMGA0%{hj^r2S!=X%6RSrD1|~`+?1CvJupQ z?pJTg==#A_`qasz(>hJ|0ZIOr_q3~VK;lYd+Nz=^;Ygxh=Vnc&5yD{2lv9FPwA99z zr8aX(14D+ATK!p=n7kEMDV_`Z($b~m&gavGznW`{{ujWocl@OF+-j2-kz!-K>0BLn zZFgKk#B;y4!w@8mcHNVFu7G5Hlq{|awkr8H3y@g2ay40x4@zNduD0%P1i0dHmevEw zux<<(u9T2&A!Zn%x#wAr>XdFHE4dLuJNIx0l;4>;1$|dIP5898YG;_Fdx?w#r(vjl z7oqg8MDqY8rQgZZz*LF*AySxW_ijm~l)w9O&)CU!++Yz!X}D{lh)=qo>(Q37J5Nzk zX|&Z8vKsF*wA8G%)LNx{j!@0ay^(5WFF;Ag!P`q9ZEh#WbjAkx@krY&f8G7)Y(yzO zoRmNGa0`ClCbk6$;9X*q>#3$1%SIb-yH9$!YsnZaM=MiNchXk~M<}{BrcM$oJ5pW* zVq8`sg+VLF@M>6L>as(9u?9?KI5ycXkgZJwzNL*OO$9)W=0m~t`Gp+rKE6oxyH9tO zb2@dh0fl4^x64Tz6T2FFal0wMO*qU#>j4el!QWcv*%Bn?ha|W)-z|+720$g=v7HSV zOhi6)%BjW>q-57Pwf6dC}pQmO$863WF<*s 
z&Mt4S-yQc$#ApgZOD2O`tEr{!>F#%1VKpp(Q`MEUMl_cQN}L6kvvBmLrcM?BGNL($ z9E-8DTI~6Jg<<=wy7mijSnhXS?xqU7h?1_jpZUMkeeS9r0|15#Z`#$tjw=YHxwmGQ zSGym#x$A6-c?}_FG{5%0P;<@e=y<9_o7KbKNE+Y#CUbsMIAW>0j;uj`TllwxeLJ7w zeXib$yc1ZS#wwM&U;kPUMc&JY+lTt;zpx_Jt#T0`KvDEInK3lbnjRvi?e|YloYng2 zen*PCPoFL#r3qM@ua|w$o}ekrNd}|L*HbWMHm}a1$6wT`n`a%pPOP8ku97`R*TnfI zk>v}3ZhY^MIf7s${pr;4+Le8Upb#hR#u<{ohL&pEs+1#TZ3F8Ka>O@iF@Qbj@R!Z8 z_&Znz<1F^Fj~n0Aye;=E2UG3;V4f*}lDkoN^>r&ln5d(jZSKA*pR0`i!$#Yy!6j3! z%dV9ywg53{zW2YGAS4N2jKmGjx{hECu;x4K5zn zGmh6RHvwVSmA2B$%?O(1!jEX>7U7;rwri8LwxY?&yFu;7+yF>y!`{hhc6|`8#81{% z*-(DO_wUWuXlocwQ*m}B+gYO>sTMNYt^5PsS6uJ;>L45zKXHiCxu!!1Ve9MDBI9Vi z`-X@0Q)im3Aec$?2=bRMElfkW^@N(b&}wGk=&+Zn=j)}CITXb+isxk0upl2eTfk^h z+H?82YT^supYOh(;|sV3kQY$GEUQ^#{EG-Z8D?u5Q3lZURtDg-m;ZF}PuT~s z$7C%4H>+?bQ6B#zos7xJU63HB$J)}%Jt#SRVjeu9uJ7k}lIi;mmIBfYPT6E(A-c#8 zNo(Q^1WKedyYSbwn7N41Q_7y}l%|wVx_HMY+6K#01biFSwdKrbVE9&Hdr8LmJ{MH;s1lvZ{zYVKES{HsvH*-pbB?S3_a+-K(O zEwKntB-Hvt&Sjc})*>fe_jR?ub#NvD+F#x6n>teF0X5aVx2~)8fxX`4|1yNnEgikf z2K4HZKGuD06z(q;3a*v{_?F zdLpesBpESYjD)aMP7%_uVDdCbs|!)12q|4WydQv*R5`EHf3pr<6a#tq5G?7FHTxSE zPI@DAmhM@DE;4^o@HC8Vji*hIX9Jz{wjawDfRY;hoLtWer={tT%WKPz83VK={|QwT z>3kQdZM7b)b9NWd${^0yS-p#ZDz$c1#}qC>a$mS%H4^8SLD|{rV*~P&#`R@QC9Z-g zcMhBr%{8QC&Xm;x`1O2#vc~l{gnM)NT68i_FR9-|m0zt~RqMV5CQ%-#r)(Ru14v2q z7}w4|xr0y@Rc7~Y_hawb+TA>I<$U+f{&^ z*z|aL6!{3yVcUD1-8Td6S(*j!&S&qE#i+$mDZ>j9)mvM>A=yQcWZ$nll8c39x%rsm zv6`osz+pc-oRX7E5mNGYK>fGG0E#v|I5uuZq8S8|j%~wl1=y2^Um*|M^Jyy+rF7qt zDyu-He`S^7E*KW~Ojo{h_xBBa)T-;D(xqR`$I8wXBIN)~m$BK*AVithAt`fps$CQIt9fM1FATiogvNeha;YhpMyezG}DKf)}ra1eW z<>_`di&H(po@Msjkf7>X{vlpZ+wWey8>65S|4$#!56Ok@+%!D?Z zi1M1+UAq#;;GWRbo$vNeXGsBcEh65bI(Rw}SO$tEBTjayaJZsHPr-_5frl@rf%J^c zGB^{2WpxI0HWB$)_tm-lL@L$;eCMICZ?jc?ZB<=BNylMIS}9-b2z%mj@Df5=ZqvML zqRU8fAK1KMXtSZ;eZdf;tg@~mV9dHt8To?6vv~)i*Yl;jyfu~GfE1@W;f#)W6GbI$ zWhw0~Ap9As5qcYx7IrZkuW=9e1Z(f2ld+FxjBt%$RnWzsY;xf~EIA{qJ%RNAR0&y? 
zNPE~tE`Ni@IjkV7UjDV8G{EzvoTsOVvnTlyyQ+&mP0YGrXoLgGSQloSn(++AyjpFy z$&x0yV0%mX4aZJKIG{{N$9C;6?R!xmYdcr+jPgYNvOhx{h=9^A4iVbQe<@;kVumz? zmcc4@D3{BZL%4CImN_eg)6jN*>jLpg1X=Hyjjy;>iTl>zu;HDbM)y%TFmeqj6MKVW z?M3BR;dH*aQ${wVwTHGD(Si{Rbtp!GXe!Nm)HEZPqK0%}a}#Hb^@C}}zTSS6F^whz z7p&_h%wYHZ);ez(29?H5bf!E?w>?;z1^4w(uf;3T4`JZ zrTXjK)aWnQA%(32G`f{>oiAn7ha#!k|Ww5aZ$6>AsV=>wf+d zpD>nz2rJuc49YI03QX`bhuZ~dIoCOcS?#b_-xDj?Sr+is~!tswbD={ zO0o!QeXum>#res0ec4Pbfx_0;YU6QfezfJ@6nAJ|B_Y}Q*2TE$=2#bE<^z<@M~cL{B?ZCql!4?9WB}|c+mY5mb|wkDX#QJ9!BlE`YPu=r<0$f;Dkrs*UF6;M zRKzrx^w@orbd}-k^8^k}wVkJ9D2B}EKW?B4_LQJ{fVQsM8mK2wGu~cWX~08DT#s%W z?10Qyy;#Tn!eNR{ESx+8_c}te_{(>PC(DoEIY5&8OjUa2`R+^FrVeod?y0-ED9|+a zi>Rr3AJgq}_H+qF$u~>la*`N2yjz{%E5X>d+1k%whpWTR)-vQ;540_?>p+URvBumD zP>uH0D&nSaa>R({nC=!z>ezp{ru*BlWa|+tL%sWv#Rc|%7|d78+_!7VaS!e_Z*A{1 zlid4Bz?qiR9sptV5v6hoAosb8O(YIjPXfJ)sSK6onZj*Y7C z=Kufj0rk&=1|RvM*Nz`E*pq^2Z`U-x;B)hZi_iO-M;4-mHN)?jUKa&=D!T1;E(X(^ zdq-@>e)oYQ8~b2HBNxV{$1Up!qb8af`Uv5t^{O@DibS|uPCu^Q`CC&uLU?BX`r!L> z-^0gM-A5ib!>H9=oagtz*PyAZ(OMJOt56CubF`Ueu0_ypLt8$#*^mcPO?OKS_dq`2 zNE%}utdy8?LH$q)Y;^pQ(GG_RV|&Qq(w+v9w8hQerdJJj(Wl01Ol*f!ouRrn)TF`= zw6s^V)jn3Xk4-M_^n-HUty{QGNk_pNYcKtR8D6;8H(Y+Io;Zo3MhAD5dBZdiF+0CM zBJWwmRCDg@cycW8K<}o|aWEB|9672MC*b73=RwQ=BuYl*U2d+LaInL43RRz+C4EjO z_-V4>jG)qtIYBhh14(+Uz1naN?!>SH>V4;tdL!)w`>d~!0ZG9vm2a5iyPsWuYv#F^ z^0Q~JRrfNG3=Aqxpe394+~M1596G%u#FM zn+Rd11JrqAs9T7=0-wa5?+^tjDMUO(-gzG58(vbknEbGdanKc0@&MMeM)tQep8HU^ z!ss%$j2|GWhik=cVe>FhL3zs}4Zvh*S!1+4hEwqEE^cix`6t35o?El44{H>&p~m_M z(3@P@CJORpyE4~yCl>t1d{^0D2aOBC2#z}^i2*4581;Iz&6`q*7I$B4Ds^cI*c)() zP7zs((CbeBIz~>*P*QyUpY)RDaE3;iOh{I@Skc9?lP}qoFoqm1>t3I%0##`D`z5Sb z2RIcMaZUH(E_=8HK(Omm@gcPqRwL^C+;<6pF81#C%m2W7xVO5!#wN=^K`l_!XjAJb@Q#m%yq?LOF2OQ&O? 
z)II4xMDPx8gm*Uf^ebh)PCGzI z*Rg$j=)E=q9-^ft88oBB9j!DH^P)_t6`@I!a0Y3vC?vZ`a5rx4CtRQz?#xgUY^ z?RTUO;aU%ShDu;eRl_aV(ZbE~(?a3Y{Q&*H9Ski(3crV%*1s6mcEhAE(Ey~oFbk}W zzNH8__4+pYfuLe-EswP%0@S-dQ4Y^@rM)@cT;Z3Lur5T--*Q};ubHNluCWSAKJS)O z76Jkx{cEsx^VWc}{f$qM4uY!2FyDeeIQ**@eoJs-*E+;h=#A!*b3Lq_Repo550Voj zS|zEqQ$K>7SWz_>zXOR~ze#Hls*X&^5eNCSuIdvs8hLx-Hs?>|0MwI-WrTaAa@&cP zCO?3=r9_P;k>Bk7(GCz)?dM-62n2gKKg$uL%Q1%2S4k(c>U5V9uN|#h%%UiG*_Kty z91Cn_O=Q>cK)q{-s$iLFuWE zyJA0g87U1r(@w{)zm_vLt1IQ6T8!%U&q&@gE3+X^7c^*|hH&S!6c_`Y!?2_YzF zChVn1--1$|&$JCk;sGUPp$9UgQSv)T@|#}+Zh z-o0N}c#t0zXy4&_1K}Z@nGEx<+-a|+*73t5boFSHn-$*Ud=>)*$vlCj$?A!$dJ1G9 z9WGl&@*%NhGW=HGThs?wV(?U5pryf-BRDU4_LOT}0WXGAjy zVNL&D>rP8hanYsuQgD_+DTrO^8a>Mp6m$37cb%6l52)1+#Q{i>Uatekm0*wCWi?S+ z-72JHSYE^?j@Z7{r&ptkyIy zCB^8u10z0=QjD?54_sB=38&6>>L^sUCFC}W8n$kpI9SJf;|O7%(=;OF`S07dy+uk- zcZ89;T0ocR2z#eS>ZI%#LNd1NtH&INlj2NiFDIV=+>9Dd!d5^v&DFZEl&$KnR0}^C zP_Ieiv|vj4PMi;)=@Q{rM>jnSYZDghI`BE5I!=tM&*vv5!aB_bKv`tgCcBG?sq@x@ znU|n&-VTw|!{X0nv>tzQ8gT_dPNjN~5#WFh27oxibv5vYy^v@NRIxLJ%K|6T30Y6;LA8Z1g`R$m&* zkn{ZBb;)HY5`25GZ^Q6rVgO7F+N-r2UL{?DmZ5R9?$1@aV!qkY5z0^= z2X-PzTgi^$@n{g6fqH#z98w4mB;m_s_hq|9Xs{!Mccb=*+oO83U7XSzthbMaFn6)+ z1xwdIju4?4m;UXB>g-ra5#7ocje-oqEpsfHO221DxvGgUONJB_wU$ zUyIDk!r^p#eZ$fs-(xI8LrrlN#!RiC=U?k$Z;czD>m9*9a<$8id_FkUwup%XI6Ru> zGN+`^t;DDk=7yM=m&wY>hTPU1v-e1SZi@->n>8X0PdNEwg<^0Jy0#H>OGCb1ODcMp4 zoTxV>)M{A}tAlE{on#tJMc6jt(JWkT(Tt&RU+@Jh?Q!2 z*7=S>52m&U$Bx!!>~#bM_*bh9_wKhLPt|Con+U1Ls3Rz&9qtLvHsIT;>^5S0#QU{E zy^};N8hVS84-wL*oV#?Ewf9hyvCJF<#&1IYpV2J z;~pWLX}6-(_({Ib)A3IMrL(yFLG9Cg1j@GA>{Qo!2B&bW|H`#s+Z|c3Grfnq1fw1Z z$M)s%_eBWFaf`)3Dcp+_e`_l&o2d2(bt-uvaI`h)3(jg2~d*3QHu=~ zSkZl=drM*^tcix+HueP##iM2Xy;@jnFh{D5jUKnxAmU>=6*u|;MIqh~m0OgxfyTj1 z>BZ{;OHfxYU*G-8Y|@|yCG^SpKIr=#_fTVl>eebf`9yNC1RL0Iz@bb{r?sl&KxOQv{$i z%GPrKLARVmDOOdR923UDFaD&-Ur@=WdTu~YB-Y&bl;Wgt>fzNS>ko;X%7-g|lZu}Y z6>49pB4_f;@POehh*ih@8Qy?m@z+oi?>v;Y^)N?m1YST0V?I+3OD}>cc6+`CXD)RS zETv58T>_qWDCvI%p?YvB0L7ZC!Jd2V?E%0F!@mSsa~+Udz1LZBZv<|t!CS){Osj6% 
zY;Z$qRd*!5OTULR*vYlJ4qZFvcM?exP}~gL58>3$lbN*~yN3{Kus%&^cP`tj_mP!K zPrGVL2f~9xwa9qb!I^U0@hDNfKXusX1~NW-PHWoo@<~Un2Vp0efYv~Likjxz%-Lj9 z*FQqS-VJrfI|X@$qL#g@Sk(WK`3yXJShGSr1fizC&G4qAlPl&Tv^3RgBWy5bxEMtb z+qFx*mn8i8y4kr@SY@sCE$){AsX>*gX`uSWa?~WIA!lYZ)rurAaZn#xDXd{;#%9Z! zZ55PynO@5EgQ1TU4(=)c2y61~r^+eztKCnd_0a8FP<b#Sj$pUb=Ti1M>? zD{<2YrEJY`(?|VDMAsi|7A6B-B2}WpannASL`1st#{>YBj=_y*se0Qx0+ZSay2cI! z)#_tuz{yCHyE~CE&Z1mb8x61>q6FhWI_FGVGfl#2pt=M;+H63k(bU3e9^gjEgexlT{`q-SooCg0ZFWI89M}E6Q@bt~QPLu1-*_o;-!2pF z%lXA{DDNm&f>i27oAOsX^y~7y20fQOn0Or_OsfUJVQW+(0gYt*`Cldu1XGGJqpWH7 zT?$^lukC`{2}`9a;Nd(!>p*q#{ZKDK1COf=~yLbHy;hG&VO zs20cVHRuJq<~u{P16T;A!H%+Rx4qYvyo*qKUC<31qdy-pQ=ZsMQY`_)Tsn?xr6rM; zqJ>>EV=k39AHn%*=)D7u{>T9^ZNw_7W~vnk>BQA_R(3y+k~s*g5Ryxss2rFs!L%AJ z1!GO1qpyMWtbHY0UlmR_rwWq>V6t<;zKlZFA(W`%T7d?T$aNxHT1_9AbXo69s~^^k z6Ly{s^$%|~@eQDaQL|3I4)#3%4#L_aIQP4_bQ)Af2WO}XiAv|b)d)nUqRW!WaU|3!gOg*OSlw`bg_P2WB zZ^`Diu;OFU+nZ?w`Dk-#_9A`=sw~H`^S*%-yB?HR+57pq&9v=A=|KoMR-P69*4PiB zdQYi1^Z^76{@`Tonmh(l2OHpqeiyk^ab5UnBF|l_OB){nD)6NpFB|m2=?h2OBS{N( z&kxP#?*bsnXv6TPezQHGl3t3aVlNJ|qOC2dB~a#2OTzM?S)*ksqRjR-H$lS5OuP7C zIh;}tm4l5H9elAm_DWC_jghab1k-};88zq7hBsQxKHo3x`0l@{v+^}aO6pKc#;@l0 zVwe-b)^cl+lHY4>zdx{afU=2QpRe8Z*5UdB4!3Q`ejxH{)4Yq-0VI6T7PZMb2&F3z zF>b<`VT9Dw>y=+MykRB#(igtC$_$?$y(7b{N#vdRalF~vjsm@^9^BOI{|s?7Jp8qF zW}``A8Si4Iu9(vh9u0lgs7|Bw@@{jWLj6sl}jz4!};vv5+JnY*sWXJ8q>wb8ULn97sY_1!syu5oF^%!IItt+S*s>AK|C^J2ZvJhpp zoAI{>H-d_?mui-}*TJaawKeC)P!y~i$ntW!q95GMCpym2%H&pl`_h+wlL~XY`|b6& zw;1GsL~Gk;{t!&t)%y}L0J+zFF$v!g9jgB>l|_Z@EUlnf(O zQQh@5(z{>xkBybV`VNFN++cHbs(?H50oSA!0HZ((f25c^9%y8?r(h<#uU{*FWu*#D zBSp+I)2XwvX%cc4F&(?@Nq2q$kW*H-gKJCmI6|ttg=Ho|0M9qO1J$+`I*C@}ewF#* zl(0tePERSmr@PpMZqIA%8MN3cSS6PIak+c-)>#SUvq|ajv?=JEaOycx*9OjaaHz(` z1u*5BY1f4=!YTMh4>2g{B?L^is>QR*fD%sarQuxZVlt7d7PyL#el%)x`Wn=0!P-S> z%h>B^(wb@}NH+rLffL4lzMcEl!PsSwE454vs@X%dbe368n%XswM@Z^AQPwYyg~P*lvp%m7DBnI*=W;Ba`Vj!{iOqyUx zfc#wU74#-b>u7%VRCtZb@$PFI33cIN5`kdfLugG4X80TFaYSMGBc?Sr_tsd)I?C9g 
zJyVk;%W;$@rfPw80#Mv?+*0eIlb}SKYR(6EhRG?!RQGNCJPjp_<{`;y^-R(j8$H~1 z+|Gh&afNO!Dh+zHG>RUe-sjDF88VvQk>&|0l5X`ghzGDI&W_bGzR^7@bQ-u+(QDkT35A<4T#rV$tHX?XYi zw%0;f>aM1e!Ha<2kito+FGk22t`@BWx1PGR1U+5B`Zm}9P*Uk_wQsQu#B;r??5LJQ zm5Fg>((k^#b0*uKEAwsZMlxLmrWC^^0*!kF?CPu8agDH~NDrKTH6I#d%)kuz{MOg@ z-q!&J%x6Dq#!s~QUfl;uZlr45FPz4yi@)v6Z2(CQhnxTSK_FeaXSk(&!`(N87P~_2 zza1$>Cg@5b*@2+`x<^fQJE1UVOW$Tg9aMn1Z@Kb2E}R17>2IBT5-CkK;|`bcp6DEg z8|K2&LO%t<68Dw)2DfxO*rFK6r zAw-zH|9r3Eau=)I!qpUypiK_8x&JB>)01(poY#CT*Rueo%tASzn1u`C0c#fA*gj>}|g~88y@v0%^~Ix(K%j z?1}vE)KZZ|K$%}ZvC}Em*AQKTtXO3oE5oHwYU1#_mZQrMlI*+n`*N_?HsxIWgrq4) zD^P1(mWBRG;f#Sg#v&7fYqn~y`>uvluxZv|WeBk5ImIHh9-w;_CAnD9m!}{f6B4Ll zDl@S|yPI*>I<$0Dx+ZUFl%4fRN=!v++lzf*_{s$~HN342`VmveTxA%5@ww0DD&+uB z8>d|)hH$E{Ly;C89Ip9lM?Rxc8V@^xRIKcnYgs&s5@uSJSG~p)fAnx|O-=%mwuEb| z-D$8ljN3*k#>^%L)0xM>%`Pjq;w$-|TR-UiEe$c=z)O~jrK zc)z)AI0Gn1ZDZH|;aO0Ju(q%6aV{|j-zwv(^NBh4y~%b&Z~>HmcYW$&;!f)?2G=+vL7t*8v#u1 zvFT+<^y4;|GH*5`Mk|E~yN;-wEE_w^iv+4^xqAVwAa zevk>{avS%;SX<)O5C+0}JA$^4kcN>W2kMC+4NxLwJ7Q!ax637b@;1N@@h&S+C1aR6P2pB zVSGHXOgz6$6#!&j<`31{C&9EZYnM`lP9gLJhs`+JJCz`x=&P^4X+P;q_sKSkGriz! 
z_xpx=lHnYf>i&AOi8&9aBMg}sOKZQ-#Tas}k}~i-fq?;I9ZZ>+YnRlxjDSN5@OS^# zK!3T_wA4sXT^vvRYp3&2 z?V>N3XyqG=o0IC|3sDr`jdbm^2uK1tEGf{z&z3}70#c|qZZrP#ix;%p$XW)b+U?~a z1ze7jYS%*-L_mk{{Oj6hTM4IS@0=n5x?d?VIK8^Vo0>B7eweC4?TMX;ru;GC7c&>U#0wk4oEMvs?1B$}nu-2UeP*~9SP9F)-!M(L+ z9?pk*c;Ghbfz*1Fg)>V7g1Abn_BZvt1he^I1~XJbtW;)$h@@I+Q++qJK4Uc|uqY%+1?Xr;}iMHw%IK z`V@lJomI=ziE1FU&DJxCT2pXD$=UpFtDwD9dJYVON6SxKv)M(`{9Bp2HJ3*hl7tjp zE~x{Liy>^QCsvz0x`d!f?w+lON-hK8Bim*hnKitxpeet>+Rn+Zc2N%2OG<4fyoRQ_ zQ~NRKdZN<^W~=Ys0Mj4ig7;>AR-pbem%Rn{`ba$kTYHGNyDxgTd|=&#MpPymA3|YY zpL^UI@E$^nJhYp$yS8|~pCowa%F9>};N(Jp)Uzuj0Fqic6Kp41kGdGI+xlw{Ydl6$ zsTWuOByB&jmAc)WcnYRcTi$%_E#trY*>;ujA9&|#Za6I1H=p|vUz3OiKT-;972ose zun0{(t*J@^FNRW2M#puRX9+^jsQW(~{N5g`16>lKOUhA3`AATJWoSL?(y}!!XI`8e z*8Ef}q2+!}mv6pmR)CeS8-zYx38iWZ|2HJW^T>1GBOp3BSjOLLz+Pvbe6N(^R}nC6 z!?richlJ<8OC+un&XA=@H)VG{N;2BxoStatVz{B$UK;A}C@wFI)bY>&ioQyLqkax{ zKi3{^oem76s2I~44^I*kUJ{FB$GV%P^G<+FXx?$ zUG2W~b~lp|VygQ0HDskKZ^mQ>kXf~KyLS0rS#O}G{ZzE3%$o?xq?+E;AUnF>=?Nv< zZX-z3jH@T!0W#)!usYmkw-1r9iJT}CoO?m<9c-I__aRdF&=D&ILCIJpm=GQU;mdHF zdmeSThBsXS?s1?FWt!RQ6SS0brexex5YIjJMN%ZZuhfiZ!f8s9Ss!ZC!-DE+nk!L~;_G!PunJ7+_>C*!u^J(z$Vqk?9;`v@WN-P_ zB_xpMwHB@mjk>icSmXeosOUb}?N#@-$-BP$U})q(^9J9Cl4j{^KF|8Q7>w2asLsCz zx@72dWo=4gAZdTc*oB56c{cxF+w%*JpoZy={KD}LEkan8+gk%V_h^3FRR((F!f8L= zqNoCGL`)*i*KKNl?e9&alzg>+f*rF#y&dLbP<4%6cyL@e%x$kOp8%84%*dXxGTl0f zqKh8?0-Yh>c~+3pAYAG5j;m46Kxvar;9yHt$6!PHQU)KGQ z>-o|^e!b3uZ{*9FQESqh!ius{|GOoaHZB=!c*9B5d{8?@cMu{rW)5@H)*eWrA)v~t znU@;nP`aPDD2y9B2MFrS-?b_2;XJ7A^GCqER4fVf7!EVrl?Q$0Nutjj zE=#(n0Djqtb}VDCg9>byjQvt+XU`B6ZSK0~v8)LuTkH+nHuZ1VW~_s~%loeSnRRO8 zvhDmW@=J!*0nYbwOtUy2W8wSt-pmqE%2U&xTrt!k8J9FNH|n;_^7HJjsdB8ny!*Dj zpt%B6$!1Sea$#k@mh`j>G^(`<0T17!UDUnennzYA4s&2mw~DZ)`)G2iJ=gXsLJI9R zKr6Ljju!SE7-MROMe9&>nlJ5GZQnpxGnCa9b)bv7XXFS=KFbA^bVes{#&^E#D;0H6 zSbcGft$vBaVT820$-tl%e|zE|sx|BmVM*~1&~_7d=JWC*6bCk=2ugBfq6{9#fuuzq zMXR37_h#pIay$)U{^6OXe9c0scr*L^tI{2g zBp^BbFIR`B@+s}T?KzwVlREQ^-lgv|Xp-~6UT;1hvp;|kKEFkaFFv0`iL7e(X!f=( 
z>+^_mwPMEb3;D)@?R+UOqNMY?QX#A{qg2l-M2mWoZ$~_TOgiM-S^?-*A3^C>`{={%$p2YB=1;l#^IdRYe%tJD z?P*}L4qBTj3;m!(-ms@cE&@`9<_s4D7o((UUM~wx*Hg%c-X~mI5D!4IJ}~$FdW(#J z=u#m@C(0H5a+I(kr+O=3HE(;ZWJHo6hM>WZ&#K{vQ)oy;M z>q2R^4^WijQ12a-Q;BSXqzDN2vTUy&oq<&*CxU15odWN#mwC>CigVB!dT;L(?>v&? zw98Q!gwuse2Q%Z~%|*17r7ZyP{St~U!G~wL0lN$-+1$4?*IfZqnqilqsRW6v0Cgql zTKDTgyE6mOBDrNpRp?;52z(z*?(N;ba`~(eJxC(HaqF>GMh_9R+{%AN z90yq7}sKqf0UGN?+0I{ z6@aSonXz(@zAk|IZ4LSW(?9Xq2c*E$?R2gm#tYVvwH6uZetmbWY@i0=@R%ui9Tp8E zXs1uVVw5NDmb&9py2cJT>>;{XsnzmfCtAw*(o3t2cQD*>$T#t9*B)YP$g^< z8qL#yqU~neA?H~rLw}g}4|huq_wM6er7j=uVw9Q3%pU#h1d`$&Q@oRc>4;6iDfepf zKZTkKPEQ=G>G?E*5==6nvkU;jtTJ?_5#&ctz54MSpg`2a+7oK0K$6;O5;E#LeBiyA-!kqK*|Xo0z5;|#%_;m>jQjjN_w9P==^7ADGk)kj4tE_* zMLe0^J5IWR5Y}&acgVz_1n&M`ZI9e~{#!dL0CDMK8ugA~xXgoslVm*n5b5WB;{Pxn z@76OU*D&s{>DA?nvALGJ=XaGp7Q5{GPz}NZ`q~5G6k%rW&yDd84i={$1zOCu?(;Yw zzP)XWQ7#-l)+WkRsMm7yZxUTG+EFpoRvkf#DbMo#=U;JMYQa=1ggfX4yKrhWz$~O0 zXDvcXTR0apS{5ftANyBUNry@vEkWw-5R^I`2S!xTrQN5jEv3E;PE#1gBLt$7EJss% zidgmA6%Zzj@T6R4CM!Yxi&svvbX8*3_K=@eL+Oik7uKh15PIFf(-?II_-aRKuP+5&P6D8&lyws@zQFss4~ObHe}5OHOa@${mDm89>14nr|HvUu zgxb~PLBtfQROhn%8}2BUNmpubbUTVl-uX9``3@jOvZ-Pjk>R`(6_eP}9JGQ3;ewHn#QNsWSU2TPWrc0@67*j%|l&V=X+xQ`| z&mpE_1apbk^BrMia=H|l3kb?xV?z61gs_pkQIqq4uA47`#L!A;vp ziy*Jjv`s=NWy$JZBTyCceR(9SIpPM0#V>zpB~c;V>vsRHhB|qHl7hdrt*_onyN!T% z+iFLM=FwIONXl0yr8Vq61SROw0hK^F333a#tLB;eD7|4bV(r%2>_HMxg{`|ckrrT@ z$8DHgKYWyQYhgGh!oOl%JwY4Q(LEo@L~kDwpB`m2>00dEi>YG z2oDr;_BU3M8-nR+pZvtjLrQaB&Pp)6h$T>oj*i4swJtM|A4rlP z{w&Fn?+Iv+WDpzqWz#$4g`k`c(Vp9cz7E#orl;D;;`;om4eorib^!X|>VYmK5)pth z*Um&0Y9LVDpa0aSs0v`JTc^`CsE1K7V^8e|ZSQ{FPFK{(w(`S-`Nqm15Z4zN;iCbMp7jQ5F((O<%G3QG5A8F)yvnlt|kwJ|26D*fAFM1==Oj~IY=5PHL)x2^PHM|o$%fM*W~2$YEHEp~+O2fb!8z`X}h zI6J)U?akx@Tx0Hir#5rn)l!cTQvdR*+F0^9DU@cQ4o|uiyrrQBJWUGN&r4NpihR^X zhy&wiaJo^mVbh3)_JZGPWvp`^Xdx5zgu^B0zokpS z)u3;Z1%kcV?@GOLRvH|UfQoB6(xHi6F@av&rSV$36SxjeVMpF?ckb3B$o<$JPO$*CEHsKa^fEh@?1Q`bAPfP^p{m2ZX#lg*QZ?M69eA)iE>p9a!QWkaHxhSh8$n0N61Sb+5$rX2^A 
zvVVi&ANsq^-R4Ok6)F{@c~d=wlxF!g;`?-BQ<;soGeGz~ICikR*>V;&4K#YFj3Cc- z_(*(;!t))zr25xWgSm-&vHLWRv@XGzRDO93eINx}`3Yk`QJdGVa*A~o zPKq^{v!Sl}t|8;BcawV?BG^0NUmJ$I=_)Xk3Is!6SsZy{(%PmeUoe;d@s zTT8LLBdm0uN0a}D0oKnLbuXZ-PYik>UEzyga&`6r7?#^ zjd?lORso#uGuFGa{G_AMd-v68!Bdnp4qqtWp7$H~5h}K9$g2KT zTfXJ4<<;)@u?byrEvW0*$Eb6ubx>Mh>sZgoWql%)mM;H3NdD!fCGHOU5tPp%Ut0tW zfJ**})rP-dwNwegQVxe#6-L48Q0A>c6RGDYNjIz0JI}~j?+c7jeFN4+BSh3*S@lYgs_awP& zALB&#oh|F#heqlFo|9F7i{1Gi9Ga*LOCTo3686 zGMguh8|Pr$ka;}^uFU5VBIUBJ5#0b06C5mwaS_1jxBILT1abB?rq-ngK=PUUuHCCE z`C{%;*>GG1(v`MNnGvt$H)^axUk6ez9$D1$2Al#7a1Scgn+UQSqPW}=>`Cmp#cqLo z8zs91G|09?P)ksbRlipMi37hUb#op&>knaE+wddv&^;uj=Cqbr5UxJ;l{%s)7QnQh z1M1(eS(8G5R{e||ek532Y3I+6^Qn238zEW%n4;RLJ6J~jPf--5nV!^i@)0aw)|ZC9 za4Nmwa4ZUDT0!hW{y}w!g`nSiI-PngLXh>5c4=ubAZ0479a0Un1eWrlI^tWJh;!dA zr%KE6!_>6lxm;LP4U(pcuK=}VPfu70g>7$**Rp06oX*xR`;~ZDo$qUyw9yXsSoQy4 z7F#C+9&MfnR-h*4wJ36#nDJDMF%Px=f;;PlRcKdx@U5@=@XjA(N!X7PL0ZOO9Im!y z??67HXGv`k2oK8`ObLe(QVIV5^q%c-iev1xvACoAq?!$bb|NUz)++0$aOzC2WnZJ? zjw31L)^-*&38Y9pFBwf2o=%i9&o}-7$?d8u8pnh)23{-g!p8&6i*0}IL}E7gwkL&^ zd#eqPre2f9wENlaqc5&B%AxSau1cMZoJUX-(E(HpA82iRn^(GsY&VMkR}sSR&rm3erPomK z<4a%qm~k)Mn;PrFr58);?Cl1s(e)Y*-4skYhHARF1*SDOwzpJn!_{Hh{gyjj+&60Z z_aP{s>HWUDCoILI$oB=4g!2;hevqHu++rm@Q=y)EP(AxUJo8ve2Hydbdu`Zpzd zo}i_C8%uM33Z}M4%MriiKSJqspgQ3y(leyALp!%N$Is3uY(AYnE$fo#7JS&T95scJO5N>R5kflw zZ%$QKCY_ob;z0~fPn;z|Rzs;V8?xp>X-yZu|8PA!{wkatSf*AEYZ35t>WD(F>wX-b zYL!@zAa$d+I%OXe?oH2qpG!Crv%e!bM4~#1HGtG}{9pcyuPghtnnO$W4<^0$oTaPn zFqG21;le>3uWm<4cX+Ms1?+$`^VPX)s3rL{l>C&Y-6@zw2{&BstIdaF`QGUTqc9>N(3X%#^(X;d^dUMD;dBzgz zn`@n37HBj9^yIX4Wbb>`|4%0+?RKOLxX*yhqu7 zD^a3r2uA4Jn)CYe-zYwm>>CJaliJ8BbxwLW5i=U!>|Tbsl|-gB*=@n_qx{a*w!)o` zGI8)o^D+KmMiWk-98E|-MUoeXJD$Jz|9PD!QuJN8cWrMu;k`1 zZ!8LA_u1skVz63#vMjZh0AW`-x5a9I?AoOOZ>O&Jbz1)eK&d$rG#e}R!krbu(rU#h-W9-LQ zlLvw+RsU4o=dH0Zjh6o0bIDx9S;Vx!W|vlV@M8$dezeT)jti%*uea^y6R-?kveA7~ z5L4{<)XMHukRJ6L9PgZlBr#JP9cP4-#HMzo{4CsCE?(8&%J;}$Wmoc?Us96k_^3UP 
z5I&5VpIM&GzOQ^2x&&wEzDM3%gcU^AZ<14&x=+MMIo$flIJ9&MsiK!c)+Pzgr%r_I>Ry%vQgp<=?`5x1AZ=<9} z8|!@yvH(F!%KCa~J9n{!)%_m$R+$w)0Fu_hsWu5eM3|qNZHxCD zSU&2grPkOxXqwGq)Ksr5F{uw-Vx=yX0)Zf3a~RLH9|d?3&6@357qu)|RBFMIR)@IC zqcDKI?q34yNcE{jXi{7K!xLY(JDB*|xIH?z1WZx*oiq4JqPs0qJzp(p{N09(k;P)-4`J2 zuG7YP)$}454tO|r-%L%gmyl%C+}U65#jZt|ZhZyBoOj)o#z>~Mr6(N=C@+sXhjxo#mNO$0_hQm==VvT_SdH;A9E5^V$kD)wz?hry-x-AO`T zSGY3oBK<=o{NS|))9&SW0-~KbmzH@yX*dl1U8>&Sk_26xttP2vQ1B25%jSN!?TbEw z6paBsZ)W%ybro=?#f^0_OykMjL%HkymS&+@x6F^w&ae!Z_M0e{{?(L{8UYq_C^3h!&x!+J<8_jW6zg*34#Kx&aE-Q z-MBvB%5l?VmUnTKxBABl1Xb5gRo0b3%37+^DkvqgYc*C(}lFvI8c4z z;9eK0sRf|E&|Y(#7EbNhztwRUS%8F@<)BVN$Drzzd;7)pnU>=S|#*Z`dtS}HRZ7;64zJ68OD!dwbMh-ly&;|{rcTPPCcTlGzpe=7e znCmE5aj-pLasx<`b?DYsQ#X;)kKU>ys#}T9v`bBH1HBoN3cPEir2HMU6rm0$XpJRh zK15VyF{<<4gA}3^tlqV=`zQ)wf1ml-gZ!-1yi3b39z8@!KEA2lI9z&!md?m>UCkcD zk%GIY=rgvn$w6g&f-dQIn-cOAz}ksYpFYYDEH=b?2Bem6RA*XntbMd+s)qMMAUu=1 z!v>=tC2T4+ycK6Ll9UgYvFs9IwJl?Ng!d z;mu#`N@RWWHyx&|5|l$<<*-^<0rD!Ig|5M}1}QaQWNW;G#qq-vwfnmkN)YfisApZp`ZNY#30QdY0+8 zYMg_~jVn%aVMl(}#M|mt*v=0A@WdY*=3tT@VVa}(p z!YoRUll4qay<}C&-7(};VY6+Kwy}R436BoDkm$P;kRrd^vrsr0sEE6#NQP5^G#z@q z;Sa`*!&ZItM=lDS@qtdpB z-SQ??Akp8kYK;}kWrQ}Q+91nWP>(p9i>KdSqv^kiFlfSB**dp*GL3(&8YKM2U!j&#Eus~ zY5ErqS3M5(*NUoLSE8kF(;Ayvv$rJtD&&-KphV;9r1F00{cE~Z>Y97bHDB#$*wA~* zYHgRqzB%gMy1;C~lMdY%C7JTvXCE5g)2B&;{=}!~wwZe%zrFPN-*|~&0MevyPSyKj z!>~$H;|T^~)z>k1in#;IfbkY&)nq3^wQpO#k9NOqt6Pl30zwjI{s2$H;aG9&&i88h zJKfP7SC{o)Nr>m)wSA^zaM(9e(*1bgI}Vp^a z1&^dw8kc}x>ra?IF9%M(cXX=mhFt;4%HelOg?x2rqV(l!0M1W%(N3)E`RTRh@bU(L zuiG~NI=h}>f&fnTu4t96e+wal%f0c;@VC1-2R!nt9(NE@Ynt_RJyAvtKvUAgJ)56< z;Jl=68>RXuhTca`v)=i$8X^z!6_uP|qzr(`{NUWT<51`kQu1ncn%0y=16r>y(M2Wu z1R=?41eJISr``O%yWVd3s7q$g$bq@PoU9vI&(NaIGp|Uw)o!in`=Icif1@}J>F-D-=$f`SfFzgNeup=oLK|O~MI}udxmwxGGn{cB*N>LUy%@B4RDLHsJu1zG9DD$1GjE0)o0(pQe zBOargJKxzZDRb_mqk4)0QVk5+o3=O(WomTPW)H|dkT@q$Wmo2_M<+(h%gIR;1>^+Q zXgUS-nw_B$fdEI!y~4)w3=|H}%>5-%0H#eg#RBvkLh3S9I<-QaN72PTO+;SE$K%#O 
zmGVVU<1#rV2NFI!X{mNuI9+7#SUu}h3hxy(MWhjs5?2E(bM?P#KuR=FE1&D2;*z?x zvv&hZ`8Ft@%HKp#K7EgK;P=XXz%3LhdoZhyZU9JZ!g+no40pir!fxmvkpXZ@)_df~ z_xI2;$hOr}31wkH3?M262~$$`L6^+dv6|l>c9@Z<-ygxL!slvz`xs0vJstUP2mmy^ z8mtS(PleMWFa3gn-+kQIE+GU`rJC8uf-lZ9fU;CN%0dJmTiWl37D1_?LqnBW94w`n zF)o2M-lslh%nRZZgGD;RGN30!>;6Y6P0Nwe^0OoRKe#Tj6)34z-%^yN>eFblqY*SC>e=T}uHF5* zjj?YS@m)d(m}^xJINrr7#Vpg(3AEJG&8DU)QVy$VdIbF&(#An+YxeSI;uQ>#*IrIvOqA+vO2d?H@u{XR<7C@<{X)&in z*OAi0-)twkH{i&`-f2sFP3IcGYI(Lr}Nm3Wi9?SN-M#0wG`7=Pvc@ z3DlG7xkAu}-&537-MwTjjNKn0^;makq}?QahLURbW(@Lv!SVSqLx(Gq?oz21c0?jg zRoX4SMTj+8YeltKxHsB4#H${?r2C}pD#Y4&X&3EHZc~aK%R(vBf8Dc;`j?~VWglNf zVhE0W+X)C^#as6Y+XjLWFXgX&YG$fl4H!i1O@VP3Ka8Lf zwR_PlT(-kum9q+adX@?ug^TZ*+E&|%qM}|z{9}@$`?>a%s+W(W=qNQ8)*jR(m~z-& z)soHFY&uCCZM}RpAJnkexQUPel~Hnanj^pC2w}AI4ZJ!5r%%}?+EW%G)mA4_lh5Jj zHkPLlHN)5>J^ZwAs>TsP+y6V$QD}F~Kh#;2@Mrhb%-m%H13_J#(rUc(-LG3_n(guh zgtXd*(YhT<)44)R4vRREL8#1{AQX__5*t^(ablk&Ln^PymBF%*+r zasWwklXDkK%eo&#A*-hdA3#`QL}-hLKrd-VWfRTq!y~k$M=#+6OFbVWX$P{3NDz)7 z;op{m0H$>vm8;1|-OpZ-w=C$OgA@YbOY;rHW_gA#wh#fo+cVf1#*4Zr4p~aHEPhTX zI}oEEp*M2>v0i_m&hcwTGxAYWD~`2BCrj|GK-J9*7_)i_d?2d~j^?8L{j>LhXvtb=urZyHD@ zbcYjnf4lY*X3^3L#edUuJ7hnG+B^Dp5c;FFIPRA;Dyu_6{{*a%2j?{K$%NZ0ai@e+ z#Ik6xcTF-NC3{)@b<8==pcNa+g&mdlEU5X~!mzEh&V|sPJ8DM}=R??H+VcXOo_isL zUq5W5G>z*bLTXeVS@K#f@qn6k_Uu60Q@o6l5_-phRMwVPkg7CS6%cQ(g6T|0SaRta z*SbV#Xsy#O%fakNo{a|;CLRrUFUoVOMTx6$9d%BBQT73hup7PJx&DKAFStt zo`5~MZ?|Ue)SI}; zB72>L2Fj0&AWy829B4g>$8~~EyaORSv!&~o*W`Nnj|f2z7e7Owk3wObYgO_ehZS$2 z-Ta#bQoJp^+||r7ouBLaJ(&TdzfRZr*0KD?`+!6YBx)N96$QDgZ@q@+GQ zZYN98PNAg8M9_b3(xEM6PNP4!K)U;lyRP5rV}tASwa7n@!j98J-LfOs`OU*in-K5&RA#@U@A3yxc0BEba=~=nq;m96T4bK z6I|U#rDn{8Bo`Q1cNmeH}=JGvDx)eQ4*cTww&bPab0A!XTRkw~rR z@SXp!YH|-&lfIs=bN~6z|8Gq+5Aw6YK2)z|J_J(|H;*nU`lI~r;J>|K_ZU<#y<1jK z0KNV0{Bx5n##2N?#E79(2q(4P_3LK{Ve6)vfEN6p4elPVtp;)dl=uh#x^_4g0n*u1 z+rf*4rDNt{^i|8jB_X_SLpqmEmm*}0ZM4_vie=lUT83&c&nWA1!Ct=`+v^UF(X#?6 zg)s<82T+Cv8_AHLOmu%#*AQ2CF?YRB30wm@92*%O-CTDjpPSUa>IW5w=vf;u^8Wks 
zS_i1Xh}U7t)R0g>ka-;dm$Ka#s2TbW$-h6}*5L(Fkl!4Kw7XV=V1|pAUvq^SC3Agy z%DC-8<*)_1luHst6LvW1fMkDFekuBOD?q-{q8DpbrKN|`98;` zQ$PfqpUv>K-+Q|IN=<4$uU>x!QRNJuYRa>araV};-p(cLIdCfpVQFu*VxlMj;bW~b z%PZqWl$575h2D1jB}A3l&9JzZf|tPzw>?dHywYJFQk3{AoK(yIZ=F|MON>#@M-Not zuEPqqZ|;JDFI<(Kt;Oq2peN10q0ySf&@Hs|QQIms12Cy?I56TVXTu*QL#(|%VElJc z?9ub{Kmwqo$T?uV4@;9+ZANVmzzEytH$jWf57Du-x6SzoQqsY8CFXH{QtUbodIBZ= zjm^97)BLKHw^qZ-$<%I=5g%sYF|8gb*a#(_yR=r<@d;R3A8nL|qxC#wVKl!m2 z7JOV#`FJuTfmZ{%hnr+=o9~4MuOK9ScM)>hvnC(#9fWtk(*4fkwjrlhuOg|yt`bbY zDxAFero@_RinVC+{=yetSnx?f?5KNvepm--QIANq`(~d8rSsC4Rug~uX~?_Y<-~9! zSmVIcR@LsuP#5j4_<*)8lC7497v%b4+f_3mjF2;U%DyHq@s+m;HS z%SUpl*NaG7xR#tl5o(z-RReb?oO+Gy=7z7Sm%GqX)Hh2ccS*RxQy!J}Am9?m zswz4Tq$>^0);{w-7_-~+*o4U>n52d_ys+S4epI@nHI}A93EAit^AM0uTUP$HD%J+G zh*-)~-U|zk2q&kSyXbdtx)27H_45l0-bYGh_OP}qD*?hDEiKU+POLA7($wXw%}$^w zrG8lJ=&wRaI6HwzCy|mc+t1qR(jR{ZP3ajnVdE(v(u*@bO+g$qyUO1~SLm8n+WPZ! zFb-_rvSrH)3%(AgecTr<(Rv0UY^Xg2eZLy%54*2sny<67FecY2SaIo(0!t<}^A`}) z+{=Dp!Ji1HE`3vnZ8WIM`TTsiF7N)S@cdL`I}M*T_yxo;dRIH||FbSq+i{DRm5YdB zE9HCQfOP)?1wXbmdy-4t&u`T8R#sd89wjyP%uMm`a!26uwI+!F(0x-^#cKcf3QE@} z-27{Yg5N=u&*xYD!pr8l?wj(}RNs8Bqm=8|a>I5lA6TdByx#{f=VQP2lcqr7C@h-~ za=bcqFm42YfUGyTR6c9sz5%78^H;3@uA|J1mEvW}L8Zn{s9S7a?AYgNi!>gBDt&>?@T>$`CpI~iRb=WxZZ(Cx(!PAUj@@vEVi}Xhp?Ve zH)$%%{{~9mRq#=!{64*h5Czq416N(Oba`RHKcZ%oeh!Q7Ll``Aa3ALVQ&K2_!WdBp zeSi>tlonhQ=?_t`bboEwJ?x^78zrLuGeQdJio-BPzQe9WN&jD9b=gO~ek>R+zCJc} zXm3sa{~alrZKfQRzvCw;y2M+DXB{>Vv7RhD!vBG&#CN}0#d!)O$2^+#KVd9UgAJu7 zeAL02qtg3dfd{z6siE`?#EG`Q{=$O)4Pl=va4x@MB~@A^zjSsX$GT_43T3M4f1Bs-FNdCbIKZ}q7SL%5wEsC%l zQ4(z zKd2-6k9S|}`GED?YB0=U%3cEE6@+wzZoL*l6A zNL|?bRZx~z3nc5>?t=;Md$ENt5%Nhy8Q0-%ntfeT*j6Tt`t+wk4Lo?4A2I+nL&a=u51&PUAQPQ(r^Tu1-bL6#cL?o1NJk!r@$B-8~rq!v|AO z^?^4L=Eb79hGr<+!S17dr8aH@!)`C%?Vf*|Ww=YKFC*t2gzBqxinv`kU2Rj7;h%#s z%E^}#!yQ1%w6Qj;zW}C^%=*3_Ym%LbVpp|oChiJsn1jQyV{VNg_G#f zv0B54as;W@!EBpi+OaMU?l}6_!QSuU>^t17&WI>rP@nlTSbuVX%!=S*z-H!NREI9+u;sUX zt`3)HExJP@zuz_hE&~c`hpU;-{s&MUcGoG{mHbrPy!+cPEcgzH_qBPTVXh|Pi=Tgm 
zl92GO@+){vI90QH!oBaqH6xK6*9F6qZDmgV18{!a^~^%)2Z+7VIo6II{thAOar33x zH({l4m{!xz-$RP~=@*%r-xBW0^6NZnf93A))MWe*s7Y|Q-D{oZHbP3VY3Q977W@}5 ztS)u0{J_u%&@#=~>vP+MVt^u7M~StZ`VdN)UZ;G}3*h9zprxES-s=)DyvJ)h{~u8l zbhwNa?hB^`V|6*$Y=L|DW@#k)qU(@}Yy` zs^-5yJ#PG|4pi4`ze6cNRSd6Z(FTOWe?4K|q52cVRL$nM=KLRU#yKfwSs>-7NQys9 zQ!o3&|Ji*(cw)v!-4`ZC2RKUhzmSsCEHxTKN?Cq}s4`QIu$cVe!~aGITl&kv^@6Xq znmZhj$&Uf^1EQ^t)ny@CTFLR-_sdG_$5HgPK|Ee07%rA?qo06!1@Lr;jlts3;zXeb z)WyP|3fk|OR(mgju-$cQU;lJ|n&Ny%<1dAJjCG>v))=p5FD&?(d_awn)-nK>Cu;lY zXNAMiq4tE)ayT_1J8-C$fIrv8;VtF&C(fywVU1XaCr3%0)*gN(9fHCn}N7*}(3*G(QHzPCk(&rm+N;6n) z&j&jA=7u5aKnI6wY!8C!bn}~f%r;?l8XBvMro#dI+xsi;0I9>~c6huU?sbyhJ<4j5 z&!I|dsysUE=!L5RztDZn`37OJGr#Sw7v**dCnbhw);f7WU=&pq%&Sc~*aK!ts%^Fj z_lS(V+5jF;JiHtz_qO{$jJI)An&>2yLeDtTpb5a~3VaGmqK)0@E|QDuy8j_q&hFN= zGp<<>OJX#51W30TrV}XqF@$syet`(75}xm)DTCEd3Gm}k+K(RF47Dp2*as1WptJiIyS0_4&qF5vyBLnI{_l5J>x)^?{`(hgrUrN&Np+vH68)MW} zTNS5~REDD9F4EVN1Zh+EV71q4$nD*2kDX&G&|cUVZ@x1Evn`s*4$a))6+?WKxog z2uX8GolX4(n9|I)dybdjRF)-8OrrjMN9x&#+e1LoKW#$WKIl?#*Hd|}bkVt%H?!+p z-$6>-ZP=gtSyvOuY}5RbS4(^kDV=I&qMihAn}OF5Q@pQ@)%)$e*2VX8^z zA3Tphg~5><2-v&o<(;cvSnzj%avz#f?wbkU{F+q$UKrakdh1*C0|<+F7H|FRA3&ui z1sz$o&Ai(vDb>EQ^0x3_;Os(G-+lhR$Q|@Z)>^g_1~HoYugK}_y+?isz7Nr=;rW5l zD*p{iBh8e}_PySZvj0fE0&EI8~-A#ST%CWg6DA{u3;550hAiw$}?@* z`vlfjuUX)~Y3GHC5Q6yS`BqJyA}CC~g-boi=YBoHAr18amKme$R`CB7xY01Kf&C0j zM;NZ_4U~fXUU#EwP%ijdOWH+T&HH13I!x6ys)fR-vomxF{dkABbF4}q=_3rvhg7mzl&8AYHuq=+}lnou7}i(k}&6W*@th8W2Cz#pVHcvaZK3 zL+N$wxcUBBgr1BwaIIGB&}w;-!K0(4nEc%HkF4!gZUsUboO?(ckJkS$BBf35X4Qr{ zE0c_44Z0h(0K}5112e1xlx0;S~^rz4~BlAV1aD zHIw`bC~-!JQ7c}BQ_(%#OzAei+7Wv1n6E{s9#^ZnPYN3r4F75`a~%{WTk{>N?)>Tb z-^Xebd_AO~dO&$>XbxgFB9-9XN&^tok{&_9$=3kI;GwscWSXlVL9Rz8^rbiQqbjgq zDJeOC5GHP^r%Bqq**DQ5GfK-D46-j(n*!UQaBOC3@<<~MBT4`D@=X4YaEABxEmZgI zUF%i7X5L#ARgmbyGdi6EaR44!1MJ_t8?FHtU$}yGRVFywgxj{c=ZhslQ~Y za-Tp`sk`-bJpuMrFkIL;Q9ASVxIS_cQQhi-(C-MU%;8#ooDxoFGM4sNr~W;ZlzdNZ zrPK&Jjg-P}Wi{&o9US>OTCWF_E*k4hmr~iGmCe>4qR3}p^N{&oI09!wv(WgX4zo4T 
zY0o9D!#&G=;gr9fk~ngf{CPy2w##ak{8K>Y^^li-?2`)!YWGX_Z~jbBlWvUl!$m0V zJydGOU*yLP6cV!QQV`Kvn%2LE{|89&#|)V(f~m+xa>1L*@=K<2!SKm80{#EL!5uSliRKQiNf3{RiDA##kF5KkPn2cJFT6WAreu4B9Dg2E~65ZFXY+`)rGI<>E=w(`$B5)xh~5 z9cIy_^E`#qv*#Ves^kCcBGFy+i;v*&w7p97zrZx7`-pO-ElVRc9%b0&lD<0Il(Urc1#-Dt(|! zp}oXtdjKJY*gSFwgO|e^ldTgyAmO*GU#<{NMVxe+{=rn8Yl8M*`AQUB!XdqS{$ls@ zoj<7U5|e*N;Rdc59DKa@flY7|KcJs~a;3Gupb|`uNoS3)`q^%+zWXas81x1uy)FM< zMM|-zYX16FFvZ$1izixSZ5M%2sfizy;E)X%$*+S_`N@fC8GITJe@9p$NN+ttx=L+U z%p25i?4sG5D?`gZg!FPgt$GhJ*QU&C=&I5@ZS)J{k8>Fs#~VOe%8b+7njSz)*)8|} zx}GX}6Q$Qt%WH2>a~?!hg`Q#ZHYio#aa_v7aCl^Q!-1Acypt%ld#D|T!**CL<3Ol3 zRzHUz#i?m)9~%QuhSdMh*!{>u^39WmO`Mr&=1YqVnwZjHuuMB~)xYNN%jBO)S2L|hRO5fKp) z5fKr&uV=kK`{T^b{o`&b`MlrX&u9I3)~{zh>sf1Y`ktFHTSZ=nOb$t|oOe3)=6ZZ} z1lvn|n}Fm>yX%(xNAiMYbtxGGlGx>KJJ(jY>%yG**lbe--&Lt662L_B8=MSq zL5B9SqyKvS=!}`26ioLNC3K|i*q%joaXQt4(f}ud6*_+r>_$|@-Cnksz5TB>#-;~)I8$2ogP@ol7;FEK?;nD@?!1oGDQjOlBJ$z>vkkTI@+g>c zweCH|&zXIoe`Bb;VIB)~)o;_c9tR^MGplOqPa?FORW*n`24Pic7( zOm6!7-d;kbQq^uh3Ir@f?X65FuRtl%!8%W;W#}L-iMYaTFG774A%~0qt+Nwf^PJ$U z^Sco3^&T&ooByvg23Uf|XSZ6Gq%?pnwssxEq5Q~@+g$hF4pbkulqv5Wuet?aHER(QSumsQ(!nrE%V?=ce z?WW&PfUeTCbunvW?Lgm8@q(?Z$C|5}6@-M-R%+d%M}0s@>gDCS_l4)=WWX|Oa~1wF z@5`%dRKv(uhzN3EuFk3d`X}6$D8A|OvUcz1w{ZHpI$>9*RVLo=ki=W7)TH%2RI+c< zqb7h0RgUHTfXT;SQx)>b7$;Sq9QNyl40qR}ceVU*d{@iknRNsrf$=CB<$}m{hR%Q$ zqt?~TAxD9dBa?M`($UburdK2DbIlK-&Lc6#9^*IRUse;8wvqH$M3>lqE<2(G6Y6o; z2x&)KNjn}EL3gzO2?<1W_7GLwKCyqDi-!*9BshUu|DN8{Et8*&RpP8`BzOu?#dg_R zN<{yH#wt|h)%<%O085WYpS>?0qkwIr_+H*YKi;UG}1E=^n-un#WOu7`LlNS zax;KaFRYdrcD!;<{|e<&Kd~4V95t@iGKt_^L}j^_l;{;u+H0MQkn=Lm$0>wz7^oV4 z0hmayZWZZ5ScpswGn|FBh>H*@r9a(Bu?U*6;v%z`K*A!@$x)}^T?$s?FsZYi(1Etg zY&j6An1`qiGzd)M<$|n#UIj-mWr>#d)d-@#s+@5K`X4EfCDXM%vBYX6L99cFdTL~9 zH){frY_H=3m-F(5a7pT#GQw^E(>~h0oawBuK_*Qcqowg$I0Y~>x}`R{(;-l#;n=aN z4%b0Rw3d5=4)*%|Od!h!s0~o!+*T&95m4}^kcn=r|9bur>kW(wh)Ahj5J33bdgjo> zdZd3(Yrs?=w~G`eJxk2YxBzJlP*T6mt`#Z+oa+46udO51@z?uV8?Xd5enV+QJAovy 
z=^eZCTbx+-0u|9tLEY#%g4pDOqc-^8luw-((?;8yL6O(>Ty4ev7MOf>wz-<6_+K^SuKw+jwzul;9`Bf& zf0m*UY$w!gJDK@jINkK>;jPnK6(luXEcf9xz2#T1DR_+J<*seteE=hUHst@3wg4pm zOX%m#t^OfYQe`tn+07qDM3!61kN(lX@ovAsJ}`wdF}#?yM6e$#{_PZhsm|t z{3I%Ymyd`nKZPKNUh?`VTC$*IMech-B!{qk7TpzQ^jjUP2}%ZC!$%LSeuqVn;-BK-s)LZW)1t!+1j_|uST{*vwG z*3n#CK0`|T+>CzlbI;PNhi@w27huxc7Ja&z)0f!v(X}=Gt9&BNtEcqJU*|`+T-D;- zH&APL<*NNHkOCZK>X~Z$ExyawH9pk3!1sB7{DxZ3_yI^Mm>QQt_#+|_|BP%8yKOsRPRQrP!z4%x2&|?3a#cSGOcv?ywGMtV zBF(dIr)Mal1%w)*h4b8N{_LlsRD${6*7V(1r@<=0uT=3x`AK}MZEmud0H@|s{l33;ZLh@|wqyga(O zu4mvz01AkS5x5|~321xWoN^(Ud@j!!7>54-4ORqWKiJ&@RN97lQtd9Ip>cpIE$tG& zXS-oZim8|0S>m7=_$37}qd6{xdcst* zG=Xank!)?ft8lNyB+XZpe!dZu!=bJA4blss$dZ5XaPzadKJWXs_3)d(B&+I=bLfOI zf|IWs+S1GzkcvFIyI)1xiWJL~ zI}pixE!Wc$^1k`gTvdu$P<1j{r5$*di*8M&4ePs6s`Vb-%3hCYxj*eBpebObaG0Ty z9BzV=i8=ZEmOCGN#FK15Zwkx@>@oCwPVR|$8@7kPPJF{ z?!bw5lk-P(Q+GnuUfQKkclEC?se!)@4tL`csL4Z{=ik#arh(Qk2X{VRM|a{=lDsQ8Q2(uOVeHPtQ)(Sv?Sl%}N5)JXXdh8!-p&|SN*9tH&- z3#$(v4IW%En3IF{!Ieek7m|G}XzTpGIuP}7NQUQHHF^?IMArv z@H9r$+lP0!Se*8d&mD~_tY-o7XhHvqzW;pQH2Apal+FOAK(p&(v|M3cK&C*bdu2DV z)O{e|EnI8=BB)v|sWIav&q=IYz6>O7mbmrT)CElBK0E(gWjVFr}^*Wpz7Wftm4>J5ba*iBf9(wk7?+oHJ?b11)C`luTFHk3YWcqI2!a13fy z{~fI(g=NPm}u;CeghIf9;#k0vrZJ8lMB=EzyYL3rLD= ze6}XxSxdf%cKcPnaE6@H`8q)Sr7QqRb}2C5LXq8!g|8ZmzRU0Bd+5{egA8ibSbl&c z%E?_m{Sl}-StCW83LN%VE#&S%r^BIS$Puf;JtDu3Z)yG{M?#T|?GA18ko5+f&|mw_ z!M~&*0I3pf>*FzCs=(GgBh3R5A>r=((fPJM=s1kr|M{hq1CNP&P1|U80-V4GOS?M} z?D}Ch`E@QtC*_^_7jufm$$?jyan#b*DPU!`t_J=ewIlgdM50*PF7h}H7N-41X1ct7 z+s%WEkdk!8APvD|DwFkC(_;cWBfr*0pEEu7D(mkSIcCqJOFF7xb^T-dq`9*fVRBF_crSyWlZBp;2r9g^}TV;)@%MmGwwY9>t5{wj2Z!^eM!5gNg zb3eptI3?wKH+VJ$XaFhDnq27pjelzq)r?#3Og#W)cJcMB1|E}ITb1K2K7@)0*K}4i zx&bb2a>6gPHz1sqI zIN6730g#S5+~f|A^3DFDPp=p6M5KV4s}J+pU6^Ft`S(*h>R^n!dunLPgtq$8dr;&# z|J=xU`J~?qMxM2+4sjoX>{Smsg69-DL#XlU0XT6EZr)1$rx{?Ra8RfGkVgT}SURhb z^5OnnTm73``lFbHcId(K-`JNAh_VeikM$q0fw%hqI7U)7Z=)7HnUCu{kJhg~73^cH z-Ri~DFo|aG&NF~it-72>(7#t!L^=UnVb(ln|4+ZoKS3pWA;0aYYJ9-6@aotF1_VIi 
z?H%U}MhD1G^DmbB)XPBEQPoDVc4NjXd13X6Wkbe)K+D#Jxi3^58L0wkw( zHRE^}Og}T}hTW|-4K@BF)q(ErlHVEq5ec~+gXNVEFi|a9>**#?r3rqRk43;qvy$}@ zqAE~H@ng?ow?oGD*{ld)gf%nQHt=fwpXL?cPgaHc3`~7AO{jYQIYL@i)P0=kPreA# zu3KO{5zUwVN2WYg%3mQe%q+9BYLpgRRr_mfGQVtV&F;3>#N`{LByJxcY1@3i1r_y_ zB{Bw#?;xq+2VGWAZ-0-7gz4fma*gzdydwin4=?YB9}$Vh^r%kZtjX+Qw=ZN}KSxsa z;^8=w8Z3Lm5uTIcvUvH|EkAAia!W{N*sStiqSF_{y7$6-;?5(Rgi=w-T$6XJ`kE3P7arEAyPyJsQyC4dg{9<{PBf=BIJX zAxdW*q;v+giOS3R{5WU2t6+y9Nk3bF8$2cjR)I8>Sk14=>qd!M^SBn0z2-}|F~4X6 zt5#49`mYTs&Z11Ky&fAuu5Xgs1gH8r;i_FpIf9Wxms={QF(8d-aMa+VX=&$Mk>VtI zdV%MJclB^>9GZY7|L12?2|T8vUp{yxg#b)YbX85ZY0A?`5-hDkt(t{W9&%Ss6Fd8F zRLBxQrD=D*;l@~tWqTos*;Es`8$Bn+He31)if+?wG4vSi3pr zgXk{BJQ65g`{@X{gsP8g&cqL6B9`GjZI$p*L^4$?kd`4S3jJ5kBO5RC|6`cQZs+_H zGWs~2jP`n`JUod_*bdh!1M5>E^qn;z{Ufabq0(GE#E#Iv+`KVq4v56Tcbo?HJe>Ue zWS!XGzf;E^(I8+VtTRBV5I~|gg{?~UA|hGaWdXFcf|vRLVjy@KCfDII6~2<^X4p8` zf10B)h4yOy>09oEr$@kvZu^vxi8TR{dS7QWjTtI^^#(Gz;2gO%7V9^0Qa!hQld5+J zNNKVlyK#y#SKZv(=!AYnZft$0Cpa;drtvN!WxI1~PjmEr52IDoMGV#Zybng8OUJ7L zeb9%nN`nx`hk0QcgS&Q0lYo=f!OgXywYQvH~fc%G&mkf6oX-d;M5}W>LQ_Z1{0wc)o0!PJlG)}5FO)afe=VPEqhGt)TVWY94v@yWGLvzgU~9;)haM=S=PDpkFol;HL6%qJNTV#8HH!_1lzJWI+?20tP_0;+u~R6r|?~eC69?(H@)7oM9e=j;ansx3Sjc)8VC-rmA8?+qAJYP4*Rhl z4%OLHTfwA>8a8gmO^@gUx!P5UH}DmrOW@?CIi_))rBNUV{NGV3=@UTmzS6|CI;v?* z>ZhKKHW8RRn%=UN^oV?FsYeaF4IpE&+L|I~x98PeqwSc5JD^M{`ojdf!TV0XB##^B zpJ?k`cVUu?&Q{Wf@4K;bVJ@Y_@994(SFqBa@5LBj*Zdi~LjU8`$PLYn;C@UpJvctP z)!PsBueg6;yvCFVG0Esm4Z#luF6`BOxDS9Sa<5`~6d^UdKo#shV4-%jQ*AMHeo&qC^rQ8{6#{M)$G*pOxqBjJbbyjV=c{X6P z6Za_$fc#mVYW8pTh7i?FIWp55FCfwa#)hk)5A-ig&p%&X&x?q(#xj92TRD}iI)In* z;wum5KV}GoMC#t#o(G@4f>9~vACIFWB*}Z*81^a<$!ObUlVA#a4X315te`h|)Z(_6 zS^N#p$-JuAy2m#$3$?ipv>|#qIfP9W955|=8xq*R{u%p2A3*7?W&3y+)0IoBY_-7j z9!~Ncp- z(W+*An&0bCIF;%%DCyz6H9I|5+hRWNzj3f?TZj4rqo|Cr}|AnWiK_Vw-MVJ>boqd8WHucqTrMo)bFIBp2?fC>C z;nTpw1qw}bL!@-c@jp%Ua7g}$xxD2Hzv~*U{aPQyi- zD{AFkcv&hH6JKGiW?fFLoYsF*J7f89iH z7T`sQB%RK;v!cBiBT6%`E3eNdXP1J~sN-2NNPB?Ng4Y|2s4A_8SdJCtip%PT 
z#+CVC{zW@E9Hk~N9uRt;ySRj+!$hNC{^k%KO_T_wRPcc(Cm^udzx1k=aOvfy9XTs zmy3#Q-P?b5)zC7^1DFQQE?%voD*gR=r+n)oGY|IPyTgUz04DsgI=tWN zMF0Cp?R9-LaL!>GHFMt=s6Xe_Mr-K5c0qMj*vETf$kt@UlL(b@-8$-mM}nU;2bYhh zp=5tuEh;|Ke>jozEuYQXGdU^ac_`UjIC0fC`;k?p0CA@8ftm#wESK%)JnDdUSC zGdPUa-lZ}qy@b+Bj+binvS)#8Zc8(-o#XOc=kJT^{r;m>&Cc$l4|*z#PQLsQB|#>JP4{M1=!0^lb7%Yf7$>r^ zDL%lKDIj9qEWl6m`18~02_9APHn5ee<^N#7`s*?xeeQ>As+gJIhFwbz_;>i?~swNN@b`v{0%0O-^3U~iGU*#YoE19&SUC3 zTvy%d)Qj&yDbr@W{{tY8l@1N$N2tH8r1KrEx!zm%(j_b?}g?Vhgs2ljIhdB;RRyWv% zN`dGxOC)?k|GfdGD%pvML_rcx+_X&5f3l%TNRh{XXJan}$Rabj9 zn83z(RMb@Y98Bu^jD1Fnp@g==+SPQ8H|Hh&I7xmucP^z1LOHPmG&ae02g=^ z9KVM;W9&jG(XAQgp-Wm9^~@G#hB~hCVvO9(IKjqZ1<(~~8?j1WNJ+ zHQKF3$mE!8t@HT59(|(nuFcDQI%5}CJ|d)L_&K?@#@Q~>9KuAb#sF1Sa2xuUtZ-1p zc|!Crl@++@jn|@*2pwzn{u>c;!dJ?XuM`eY=G*SeszKMoDa^fI-UL-WC==O;=Y(W4 zP*wY}e7cDfxS@yAkx%UqtTAdEB(ocTMc>GmorkWJPa?VsS$kX6svVe=nw9^uZ3tl+ zC#9;*OzCm6{YOhC_q3BecVZ%wTFz>>sP0Ck0c6_BxFf#3eMswD~}sT>}w&gO*J+o4n_ckhtm9k2*=;yW2> zHHbU&x}2>xD&CdXMQvJd;&eBh=*^Vu;HtWH501pz7SY@zac{z)mQ~5$2a%Qgbeo2E zKPUwbc!>M=1Nm8mYpQ2`u!n8gJ1G?ikTMgHePvewXN{wolLh zH?|3^M7Zj z4plArz5Wf_3_nLr_E^4HZ>(K7zaU)NU{%02TUU zY)XC1f_D=12~IxOueglf;W>?EXFI@7K%b$K+htqT`Om?u(-?5FVp0Krkw7MD+whn9 zqUM05Ab$lebh&EUBr~n`3?$i;!cC4$`6kG@MJ0}J``wj(XL> z3hRgd{b|#wAEDICwR(l>&89{%)(CaLn>b# z&8!Zb7=qQHJR+Z`SRfR(O_!>}$$&(#0@t{D3Y4OL>(@kHtF#Ab6bXTZN2Z&zl1xK_fK10+hV% z^G{at7Wc0^AMA$aKZ!|7)>I8@v#RrOiDab>XKWJ*qqd#unJ$3H$JF6>@@Lq&5RqhZ z{+1glzX;XWcsF`QvKM2O^SYXxUgB9YOW&;1x-SI2qyD08WnPuOlYNj;S3r8C;uR?c7xT)^(scAFU7AJXMF5 z4*{a0<=5S`RsI%PHz1{)bNBosO5_?axpztf0}$$3R9BLYxT=}V#(dM%rf2C3U@GpS zGieJDvAF+(uF(H7{{U^FM|A_)7?cY9v!7{_&u!6J+y$f(neTH4=2}f>K&DtO{$tug z-giZ*3VkmS*-zGr-+dshld{@B1SaQo{ABeX z4`Y%}ih1>hk0KJFiH%CQum7ZG&!x^kh9Mox`SgVTZL5ENdlF7UudIgnRR1&0pAepg ziD;F3 zwdxo9Kx$t)qXb+{d$`U(c^M$dS@FM;&z7*il#YE6B$)LzF=#WeS3}rnQOdD5UPF)+ z*Pxuz`8u2oxZp32`){C#%64JOf#($OjjR}lpp?XF`emi_ZA?mH(9x~Z_6{QX*jy$1 zZvVy1=#Y>aM?jj*!YNA*2GIOJ#wUq$ z8uTYn612u%dsX4n{JeYnM76Qcpi~t5I2HZpaEii(g>$jye}PLD_72mDz62BEQbsV9 
z?W;ZvMt5cKHC(Aw)u;kMr_I$F1^-^B(p+Y~#aFg%DpH)^K?!p^Qvo@CkBDS1liAwo z^#evJ*nCVU@GRB3i}EX>5&X?U3Z2427XT%IbEKe;0KCiVBLm2haM#1J_z`ruYim_U z;ZyJ{$0uu{nQnlRab5Rc*2aNjK*8>9gXpn9dXxbtyv_gp{O?K)Iu4s;voK85^5O9q z!5Y%~Gk#7$C9}0@ga&+KPiTK-xHt)sQd!bA#h(m|*?EREwo`zV@5(Ew1wq?f6h>^P zL4q!?PpOvXMTmqnH(8Q99hRj!%c6341}Nagi)z*JOdt}VDmYO_cxR*Ja)d#vq}Loz z=mYu63EisOVjx*|VP9)w=VFpShv(MP+j$5@*WA<2_nZW+AFUK$0H?y$;V@1XRA(>5 z$)R0AwP=1(&>Zit)rXT9deiEedEOC%jf`5D(})V953%*uiM$G z^H_-?g84_vxV0)^dppu{HISq-`H5-(PEZbeH~xdE0adT5ApcO+XdSj26!tijji-lJ zzrO!+u;37wG}kp2b-K`o{)MZCR?-K+#Jrq81!VxPH2${M7d8fLYBm45fDtRQlmK8s z7HLMCFe!?fdU$0dL`>3kDaRNh&Ee|*X)DgwytuH#m1clYh$H2mJK?$NV>=vU5=aR; z8PFPIbt^l1ZjHm!q?Bx@aY>!UqGn2&#Sm!ilr6n~Cn$zx1Bz|8XE8W9N*ed(hdp&I z`;Ga*@r%-N6Oa-atJ->V4+qD0vlR63s+vLF3ijpI9j?M)fQ_h!ED9R`VTssW+aK>p z02UYKN^iLnO#A3I{!sx?$y1#iQnT5+^GzG8>9zNO)gbF^(PwM7!o7&9+qJ|<2Y}4J zYB>5si{kz0B+3jxrFsBP%tqC?7e9!q;;2FAAdcH_r^yiNzvVqW~| z=dHudvGgTOYI3(HVcI&ASYF1b+%0vPX~!A#l>|1>Y?}vx$mX))8d_iNKWTmq&y`9^ zRp?n|oii}V@pWvP*jR3gf1?je3Ab$&dK;I#>c`t1k?&v< zs^!S)A>Ivfd3lSs(}CaX-&iv`Qd->mA=YqmW}8qi;B&zGA1Ax>h4L+R@0z;Ua0)$Jyqbe>^I+r+H@U>Q3lCWL>U7NUl!ANwQ;bmDfoD?r${Q zp4@-$aGA+EJNy)k#y(xMvs3$DYx{7C`7{jiwRxEr7J;%~x2m=s;-|w(`Id5HzdC|qN)CkYQG(@+Jl^Z|}$5Kht{<4~V4Kgj&D7&44e=Q>MyJ(KJGC+q2l=ZsO7CZG2-N2-FPE!BeE@?~W3$%8r1kAC0E0`-kKO_0Ij2cBivxm$;5jpooCQj>dEo~b0VRBSc2CraNfTUhOYz^JG z4tX{ZzG+vJp-8dVQp&y9cU-@U_etvScmJVPHNTH3{ z`h(b-U*p@o4d@08P6Dp>Fzll;itAQO9<7ght~`Uk=OpgX15tnWVUoBT=J)TN z-(Rj}kKvL(4uhokc>hlIrsMO^iT}xP|I?+;PvNAAJD$acK8%^s`F}9}!x5uBg}Jx; z*}T4|YUlHwWuf-V8vlW0VdrQ&!T$wBq;+-4=Kz@W+2zc#{UV%Pad2u|&R*)@h!*)W zEM;5j>Y`UXC-m}^6!$?yH>WM<@aEz5)?dH%YxV@cC>C|Aw2jxGM5_dd<@Fxd>K}gu zH~|mWt>Lr)gjiQzPQ^LoIn8fI8s6?-RY}Ga(mROG!KyCSEG?v+MwQvXyMA40Z`NPj zu~EIrd)SEdvRdY@G4_3w7#-$MC-9sYYpLo(XhD&6h|*@INGS-(9v$`*Pw6yepnLG$A} zrVpdlx3RXt;aFrOTDw-L3JGPYlA+BU--lAJ9h++<_5@V2Qg?p45m5a)5vQ~jCWXN> zAzQC#2Opf=zh1q_)s#Zk0i2v!Fnx}m0HpJ@@3HB|>BLXNC&!tdw4QwtGDSucDs}L5 zM3S%9P%LM_QZltkA9JSXipIJel_5WDnN&~C$wzL1pKU8>i{S;~>ga0AA6G|YH$gNV 
z)<@a(NNK?8pnvwSXbc`@)JU&^&=2RDI`0mpQU`4=DFQ1vfOhznq{Wbbv)NfOa!s6tv#&j-(cEr$HlDf z-`H$5li~m;!F4;fnksE;agrmq&2NhCkdJDz)m)6$!)Zdx%Y|VGl-k*;)?IGM*JUTm z2B>TD^Rg?iT02B>$j_?lQma&I*G5E&zIJ3aOKG#CT$dNyF8*ejy&lo^+2q*#hBx(r zRrOF_BR!#4sZMVUA*oA8$C|pg73>7ms`fTmF*t)&K}`ViF~7g9_)S7T)raYw%^I_# zCml&YI_ZKHIhf9OYq$fF-Yk?-X_h+r$xckfHCuahcK0vTvdCM1MVY`TV9r<6h&+$i z28)~e565b@esd3Z?UA)xz~se2%sTvA;nX7INZG<~L&(pvobYfvlyvQwbJd>!?&v?5 zooh#M+?fxCZG(`KyYh28zP6Rf-F*nP!?lLudoU@I;khkY7`+$Owb;&$SxW9h5;LQ& zGP=KiV`Ox0_xSwd)z3YEl=yUu)wMo|NX6Xl$SR5+?jamOE-YR=49ef&4)=CbCK5s! zfHP^{m$&)#HN)0pAR(W%(Te-y`AH>QUi)&N1k=j@@F3c!`j>20eqqyp|yW?1XFxtlU12Nhyu)t~=`X!;*2;+T{!ho{440aP8AQ0LYEQcguG0V*lCp zc0$NYaO%+i_Q$_qOXz=gn!18`1s0Nx+@d%LRQvra!?kUi6bD3_*ow(Izt%~+E_7{=&0B+@<(Lol^KEm{{uIubNwZg^Bbm z@t2pm)1ykNUnh{cI*It31Ty~!w=07EHc(22N^AHoZ?fr93p^)JD}46@l-5$`xR~D6 zHrF4K5%+AJD%VVhhyBNv;;HJV4)-j@wEwnRe*_r!K?{KW2`X}SWWHHhn_!RfoG>pN zU)Wf5G)}eGxlZpHbgEQNqC5th1pY!xITlK%GDu%gCdVO^-4=;IK5shhtCoOIfMS=k za?|{jTf$H51913M&38{iq$aPbBP~t_6~!yhoKASo~cIIVw+ zfw(!5Ey4(rHqiR~(*sF=^OP{o2%OYOR=0L0D8jSO`V0Di=Tzb?9JA>Ja98*)RjDp# zasS4l-;{;=+(*6~SCQYD?(aVv_=}eu) zN{@oy^VIo}SCcPTWXZsMAE0h{)cJ?1oNZ{T58@1k9>f+v>*=hlDk}d#hlQFJ}sr_2;%k zdCtc-{~enHkb=)dXUd&0s@)xEGT%0;jos-vbtS#TT|Hj1y7m>^4M&tR*$z#*2a_o4 zfFRZDUIfw2bhq2z7bsgB%2(w6K7btmMq}t17lgJM@Pj?$oaxpRJcQ8*HvW>`!DCWb zn>MHmFgemew~KrqM8s3=5WdHNBzCDX8P-WXjuFmg=94Eqs^W%$wyXTDJ8S#)Q^>AL zTelP5(+G8r?onIwUZ~OO8I=6dfwWPY0+>kt&78i+H_!J>Rdn|*3zD2Mzr7Ag4dYrC^WK2dZViv61sEdV#3hqTq&Z$4>X|aK*1-HWhFt7mXuBTX z2^2}Q6TS;3&uiw&IQU+Eu9J~VgLxlR2d`ZJXDke!m7z;5rS3zZi61qeG{qpioA)e3rH+X`p@}I*gbLV+g75xH{2-djfw&nIq zlq5OKLy%tq>2#LzU^X>2gQrDcWnX@+#ZV-`v^6RaJ;-qhrOJPcmd{0X(Z_e5Mch^^ zn-k>s2x+hVa&1}lhrA%j9iwgL^dlm|sDn(Zu^o2L0>jUz%|Q>xNb39(?bPW0XQkBc z@i-EbWZqCVv7^9bey3}A1$Z<<8tmZ9#Qhk!YL4@cW%AhkDn%@2YVL78w)ad$J-){S zqvbz(Lh$V9_(Yr9oCwSQnScH>`h(|W#tkA=sbP0soED5;OiBfp#tnwNH&c}+pEgw8ffdHl7t`z}a2naZ>C+D|b zl;7(>Zt6q6CF-U^ah92%K28aG6gGS|XThhRI6hB%ZX?usbLgKqS*PE|-U=x;@$|{&HaJy%?%h2$ 
z6=SHrC%6m%O%J{oPJj~*IjWxHK1}3x=#kb#wp&u~$4a#RwE2)e047^kPPk*Brn3+B z-;GU8&AOYbIi)^?P5A3ww=^LI4`U)i{d1jV@F;?S_wH%CDS_y}-LV;^AU}rG$Gq}S zXHg{bcD&@D1SF%K(O#aM>g7|Y2w<<7A3Xx@GC1WxKMLbBnCfQY*6=Kp>|V`|xv=E) zJXUsV@?TT){b0tFg&U{a`K2$Q6BiF1MR@=gGVii-$bJ#(%1Q&TG3X_Xd@NhGf+pcP zVyHDb34aBVYGK1BT>><+*3P?E^GmIDmCw&>piqAG7iUr-JSP-A!u;Q}I`qUfOz_3^Ybw(*>9$^Pr$AgKF6l{4>=Fv zQ*5Gjmt|)z`wSNm@DZv0`*TDBU)oM8my$0~3xR`Q_MeP1jL}`b>Iu4Wt#W_ehfxdR z&0O;hN^T5vroG?h19q{_r%8PuWY1Lz>jy~eY?T&*7$?9k^UkPjCa%N& z(?Y(+=xn*K=5U;^)|Lm3hJ6H>Y;rQKb;%r_J zo@V36`XzE+PUUqwWpnd94%=CjZAR~MsWUzvUpc9REca6~z|?Tn>vwf>z=@cyUevam zmbsH~iDF%yCu{FSwZfCJ5~!0MQX;27$;<>hXtS-Jic-y6UQhF!@>@~|(=F=Z<(Cgq zJ3u9~^2+sVs2%yt@o#jFyhoF6XQG`ACb{ikDHZ)3Olp7bM%r8TcrjAS4*pF$+vnUM z=`4NsJV-)H_o>0{e6VYNb559|Xyk2IgVap(LMZ)1CZ-p`Nx-Z_?kj+c^FAwCp-cMT zIs7%9^3tAht$k}JOEHP^3N29{mLno^9>=wMud=Mf$pYtw8WvYU>8m2?W9AOz9nfw6Emu)8sY{ zC%@a71IxxTi;>!$<;%OX|8=bm!8K6t&i6u}9^FRLd*Po}sY~^u__g~5Z|uLFncE(n z0+S|0YpHeQ=W7Gs%~;8s-&g*mw*YA(T<2*Ofu-WqN&UBha=u~7z*Q^g6#1|!F_j{3 zswobcpicz0lWnE?u6#KEh}9d~1eA1G@0qfc-^>{IV5P(L!&auJOkX0VLIHYdvd!phRN- zUh*YW*SHP;x&+p_m-Cv6q|C^Hm|j68XTx+%$^#t9s_VPN^D08}oX+R%*MLr_Wj}r$ zl%D1s^M+^jMA*x;mp7p_{2jyHw(mozWZo&~6MJ^G4GVAg?`>FE3w{R`vAM~MafuFr zNg7#gwU+n#kjmCq-o=!l_mOE7e|Y6p=}sT`r7Nte({1+P_nIj8D{}tU-dDSih3I$d)9Mnz1zUV({GYT2| zvS${YVZZ8wX~#`+K&>yoPB20>G<^f4gsxPID$uw2wNBtRA=4<<9N`DxoM zHj~v4`NB9WFF)psHO-}ob>iL@Kw0{#85|y{SZ8W#egvo}%Ez{Ci#rmQn07NmYv)IW z@c!snTg5*b5$U(9``XgwF*voO^`9?NZpimolz?eHS;{;PR<%}4E0N>#JilKjaf0XM zej+#eoQO#HgDSMDFVQFAbliW;+wx?OetVpK<3Cl&vH4!-tk=2|s z0BPw9^Xo-^NDZvJJn0Rv2!8%2v<8o<*6lo__=cX@zrSO&&fhp2PJOAv*Q-vSgOLzd zZj_cG1Xa72i`-Jnos2DOblbcjBfLBlXvUy4YsH?-65miD*~#iT#Ll07}wmWEbB zU9#&4iOSci{sRhuZhSSYFiPR8QD`9FQ>p48@U{7|ylqNA>p+>RWwopjtVbmL0rtIS zFBn24fdea64pAH6^g+9fYu7*u=@*plYduE(yu#YyEF1IVC1+3-JVt!R+uw8YzCHwN zdU^G$Pn-HTOk!$vVgx~&tcbuEkc2I%HIAA{Y(*u$TDhz7XImeHI#ms8B0=bY{5lDw z)!H4%00Ab%Wqdr_Ub<947oAv@EV+U^A zB>|~Cj#y%J$n#v?`+ajh*}kU^AG-xe3_F~@VP9zVrMDu}LUgmCJoxyiP;+`q%(p{b-`OA3sA?`**4p+?k@*Xf^)Co5qpL_ck2FkVl 
zK9HcaRtCmgPjG+U7~i|MoqzuTm@2|V>x&2BRFN8AAIi&mtDi0Y1tkHLHq&3W1<%Q8 z>)d|20&|}q65?ge9sRK$Q@DirINX(G?Gt(uOoW#$r6lCX8#w+~NA)z=)yuX!n~4D< z30GgfoMzDf+E&@0rygJgw!@OWYO){dQqnfQz5uK6gKKFA9tE>x%{t0K|Ju^2a>RNG zPNbJveXb!rxJAhVO%!jgC)R^Q%FitrLHMPn1`)Ea@SQs~_T$PD;5RO&lMgYE!Q`k@YH`EMb`mD!Sv}3Y9hh>|yL-ojC1X+hIk$wq9-dBw7 ztQo{t2yxF%*2=`!K;j;Dtt%a%$IFIiN(IQvXErzX?|@{)EidCEWz_y2m9$b3n=9fE zn4J39ZYQ^q#W-9mN5Q~h{52;$YN484id?=+)HFX}B61xBuLAsWZb`Z!9ZL}5_F6w_> zSp{}6D3x35Y`aT5SK&1Wp-X|}_69l(&xhbP{8W8j4yOiBjN66Elz@mhHd}wJvwT-! z((JZOxwq2=OmqUA)Q=kg1UisFnwdzmrWGJ0p-%6k6XaLj^=vY`9+4sKtN40ui!t|;|>2UwA| zBPOST6vEgggO9{!ub6EMJ34d@C~&5mpoUAxYJMi zUWVj?%173(09|7$AJ~InSJ@_1$5LeJ0{uI-p3w%t)IDvu8&6)(E2SF_t)dY0!IWaD zebfl`CNgrEasFy4$cOrnEF?^4viCMFGIy+@$lnRBKC^8Ed$<2)i0;8&tZBZ7lcVxL zqYwZQ?(hUvr0yX9044v!lg-`oLr9h_-8b{`M}c~rCAGKkV=(bcx-@>$V}~;;nNRb2 zb9*(Aeg=wd!ylYA`Fa1vW)8Tt066lhGwE{p?Uy)ZxaeQd0z5`6W22gkD8I%i#wDd^ zf8#m1VcXo%W**<7q@O+EudCUA2g>|-JN5PZeA;#={@}SXADyHf`|4ooj$pKi@xjz@H*IOdk)rU#&iW%22!(*gSTxkuKRei9;S zSUqQ+Vaz-k6R{KA@cciN>H8F1YWtN}$?vIPbVq8VV;#$YP?pTo@JS6fvzi7kLZnh0 z+E+3^9hN@_ikDh>1{Af0vyKxyGgCXW|7Nr6BIyHg(&OTmg)^FPvb5+=`JF8GEX?_R zW`8UMP*S>Mk0{Oy)X}%oJ!9TK2G;G_PCy+_p@Jy*vldw$IDKJ!o z{>N>jyGURfPReIS`6YOLwrAKc+pgiA7=qEWt@$(R0hIbTFuY6V$_-^NP81vj1#n|t zFBc}&^(LsBe0As2y!C5J0+uk^tx(fC826qCH!9a6^xvR&!8onFu>SYcjx(AWcNqG^~y(p0#{F}2vY=`dole-cPMn|V5yYW>ttq14qft+<{J$9cb& z`(s6Y1{Z0%e~;n-MnFrL{|VrEgl02dt4jMlchYTMd@sPMhRI!<{2j;#j*+EAcurQt z*qvGRQvZ<&3J>{a1%DZtRIDzCvsb{@{TS#e6_CMh`2ba-e{W>|w^WJ1!EU$S>tO2K zxum2gVOYIkC!$#=TBfI zlCh=qpidD=q)!9DtPduGrST z&WGE}GVzUPf?85lo%JB!x${gN`3^{oONaNgJbN5a-Nn+YER(A}{3noFjTneln%IXiO6wSf|!4#G=t-Tw8$+ zIv?ee^My2RpVIPA0TsgHT3tTXv&jB;f0P3tLDkWt)!!}3tCQt`e)>o1ak; z0;jh7M*hwRyFJGDu>Aai^{4DV7xdp+Tc<97$rH2K4dtSG5k>-QL7lz;L{`qTs8ew- z$>&YAWEHymV^&L%tx9!98m7dQjE061}#KtW300+EoL z5Y^|>3)@QE&9IE@8bA1Jz1c00giMvY=B@d(xf#$QI2s`2;o#qu0(d)+b}>1+i-s;z z)&eB))?!4#J3*l>S-Om!z;lwf#%Q=z%d0o;#uD+LTLHSqql7Fe?S~cs2}!!Iu3q{+ 
zFx}J@ZOpzO?y77Z)6hwW##%}q^}d#q;|AC2SmfP6!H$gaC5;iPQblvhg-%TswrW>z)({?o9`HBDPB zd?r7qf7Itb3wMd;ep7}0JVu~@U?Qgt;NrK~ZJbVvdI4P(ZL;qFInci|K51RF{OW6r zdl9W%c9sGDCC@3xrK3CN_ea5f8JQGVwb@yscm4ZXLm*$3)<3`6|2k9WbG`;9 z%<^jNE=_wKsi@bLeZiwLa5hTKmEH^@ow^YI5R`_jieyv%+o%NR5;Ib6n^)dJCW-S; zn4rk)yO>D6tQMo=rG~wSOS1W@GcJG&s}1U3Tjj65>Vv$d9X02H55bb5F_VG-BqGN$ zHd%b!{k*eQIX=m!QopJE$3N}iL@CRk1*&ar$HV8KMr%Ib-ADBWBK4XNjL^P>6It!U zDNE&7`DCQ`B>%eqqzaU#04Iy1TU4fRLGtTvb!%GrcPQBzo+uxl?*X!}3zk>V7NEWz zJ)^HKI|ELd>d?v7Ngj6pLW%4gb$m|gBZv1?b@p@RSbYRaGC1lFR~0)FOhyK|b5-wt z6h<7(Rus#w;cR+SvZdYlZ>`Gr)|+YHm?wDlk;(40?a;BY6B)U#gv^1(5QI= zD&m|nZKXHB373m*#)`OfjKWNFAp>y4f5_S{oo37u{Bt5bnQdu44K*A<loSjS52O#lZ)~3~G!0Gge_N7DrJqqfXKZVd-55Q^|(tl2VaeuOSTw8DidoFRhg%2+*}7Fea5pamaIo5&?~IUurhez9CgdxK^7k{ zNzDds-n{6j5Gbi-QPCNV0I50c)VHx7wo|dTg30OZoD+VnJ|Mbo&oWne^H2yTkOKVu zE!EFX0`jU;*KJApj{G>>wlz%yWZjL-vrtkyYAab8T6bay=K8bFp-gyI1xFaG_j*hs z*Hufu5fqAA_9x!E2}&sZvA5T84L4&_W;yP>W=OZ-x}5MlLEPGZrkHq)Q6(_4VF{Wf zZwFFj^e*+{jvjArekXVKxK0G7O2A3Q>T*Sq@w+kdvSWt<`5w>7z^*!4_1-{6p5|71 zA1E#*XoBMB`{8cwFgx>qr&_e@vLDEsa=woBfhqwfr%i`%ZeI`K#JlXuHB<@DNtK0d z+hwQ|eF)An62d+>DYN=amjF|~bg{Yxpqkg?NP?MvqU82uKA3-TV!k}bo&qDM>Wi#H zz>%|#X-18CChtpfGmSsn!^`UEr007$Fe|40pfaD5k{3LtT^JONr8;b(9l$Djmo8Ly z@*xjhc=r9qy0jD5r!>nFOr~D>P3U~{9;Sf-Qz4zw+|35nHZDf*V zN6FlRZQvcOTsZ!P{@^*`*6DV&*iL`IrAeDf(Qzf`eN>XOt{oQqK`{MSFQ&u!u>ZvF zCH<@l{Shiz+F-IIxgR6c1bRT*HSkIQJC&&F^QVZ!v2;)AC!fKx#{-b$KL;YRp;lU~ z444Y899!w>U-n;bV4kExz{$i)zDZ_u)+kWj^pfYJ+%6np)vEqCeodpeikn!SPd7R~ z(zY6Y>lZOOh+(o!XWxOz*ba6vqW>O|?qZvWe}MWX<6e?7LH>wKZ>p9Fr8CkT_IK?= zT69&m!=dDVIsJli0avT|`_j0M1f)R$wOQa%U?QQhbbI8E#`Sfuwy?K4cnmhF*LAd)7bHpe&t6Uj8wan-jj11I`LO)`z5V&u8G=$-62 zS>gJp`kVqwecLWULpc>p>N|HWxjzk;bPB%A^@|YcDaz}S8dG}d=}0M!yU7{-C%a-Q zJrf}sU4ct}*(g9cak|OqbY{ZmU=n7{P&uyFe0XubVslbaXd9r?Y!7yqGex%k^vi+1%7eBm=g!69Y6%I*@+~o z{H2J5w`7tSrF1DK;bm1;ce^}amG4Pin8r+jNXo8uG~V9fT^ji+oMLL{B&_zV2OQT8 z4R};aQXjMIS~&IiP3PgQgA!RCsWnr*!1{bZzI%taYT`pZo*CY($Tq;qg46G7IpCU} 
z7&TaX<649$G=Ul`HbTm4S$C!3b)cmEf+xWB9;;&QmikSeg-X4u_DZZH2q~iy_sfK1 zsP4AdKe*b@2mi=U;kU?PaPZ0wX2A|4(x}t0_(f%IM|g$Fu{i2y5xs zNFFWi`Q5zpds3XIBmC&P}h4ydH zqHu&>Qv3fO^Bln&CaV54N3zGU>7CXz)#^#O>o&^8@+q+Tn8}(rJ>9>{cdoPzIH{Pe zBM{gv@^;PlX&V5+4c8@6lnqF@PEl*Cd@lqx3CYg^Fd;3wazpu5zKBS&%$e?^Y`~Fo zb;LC=zudp^)@^UyN$cqGz-Z}O2VoNVg#_Q}!?NDs%Xj;*7Eb?r z53|s@*KwiEV)Q;zDlT=Z;6bp|+t=dr*+b_U>dGraId4H+iPacS0`Z1}lBe}Me zEB+rH_7yVa$cM4ifv*usSazp;11H_v%kSpf9yX_e?*i9UosqKdK}C4h&#kWi;8_O7 zodZRM0E9aKM6=Ub)OaB2zQxp2fgTQ4&C7v7M?fjwg=y)Lm?X+ZLUr>fIEA@obgM;? zD&V7W)#+|Gb{^wd+-LpjFE&ycfRysp<2Q`EkG1VQKMtFGwe51pgNY&L-4kG;HOtwF zo~2INT*fMTQhu@Dh8@DQKv+jxUvo-cylS#pEl!1!4a3F!Z%N@ajDi~)q961>md&iv ze>z5tTjw9Kg5f#&YqQ>esw#aZPAbY;Ljq^_->>Qp)j0)5rFI0VFBc&QZkd^T z8=WqO1>6+dzoi%C=juvZ&v~fQN#X4gJ}=6;y$| z+y;#{rd)&Q2AX$s==m&PxJxR-4F3Vn8Xp8i?eWusbh&{*!lMM^8$mF=W( zFX|^F#G!wz`mXC&Q1|(z>TfNH-0xYExKLU>p(=n~?Xcx_y!nhjh+B|UJAUjTL=w7n zkH+Z3vT!}zzt-Ft=nRMmY;t~|H0}#_QyiP94&pJGyw=ep9{1Qve1{T$vj6S|E)Dbr zI1;E?NKLVx#w10Pqr^mOK*$9>pe;o_8>mSS&Z^GOgB5x^%xgc8%27>JJLz1rUce{k znS@mV9l#~#t0trUy@*K(?x;M!1SX3dk+tuaVUk_+OM22Pp2cc)THX%=T^{CGLUdfO zVp3wm6AqLtE97hWV$ZHR&+v6XP`^4$P;Yo%C;_LOQyvfrZvKh7hKTZjNlF~Oq+H+b z->`1!zI#doMtZD#3gF!U3JW#hJs?@Pt<-W(CGY*7n{>77baPt#04GUPbxQAtp2fF* z)X6f{oIZk+of$QrWZ5J7AU1EEf7H2(isBPo5^hJ^Xx*6jDMr(^3%!#48I){rZ&BAj z?{RIYqCUV$s=l7Fu!Qj?j&vw-9QCVye}za|>_Uwd^=p*OANsGA-){hgSohx?a*F)7 z`K`_bVSLDMs?NH)D)4(y>ITa>mBJ9zt=RksB`H@};s5nZ+6tyYcUn0(UAY1Lql?LRTquhoj<@`4Q1 zWXggD9En}tRd*{IxIZvB{S)In{u47keTze}wJ;L`)8esigMX9U`!+cp`U36hUq z>>1kREp!Y-lD3Qoco{Ix!RS;rn&U3^nE3S?%>YR}=OV?&Nak{SN(NYUCu?WYb3Q`) zxY?&;y&!n`_{cyr$6pAGad~&IrWh~6q)Ql?qS##wY9vdmzFd-zb1oKqY5(e`aSK#S z^RvuuYS+FkhgBAM@G6=GBp^%ilnT!&M5~ZYTvgDkajNuAt{oJM0IT>?v=&JHue&0e z*ZMkCYSx-@JG|)@2&uMN&UP2dMIbr+u2wHLfWo#sq{yxTBxJ5uBCpMRlDPBtw7`vd zZ={8B9gt{;cGbZun(Otbq`T(!#viS36Hahv)zup#KmGn#=Kj=(J{YxVS6iRhib|zf zW3aS{(SPQS!oO5g)zS%^46WqHRT-KD#WGO?>kiN4YI&o445stTSdAOA`G`K*_JHpM 
zlUxTIm1;%IyK#xTwuM^C$mh*B`o{kKS`CXo-c2~^xV)|*P!>0XDbq7AWl(_Bo+Zj(Y{aZ2W>y^~@+dOu{y0=0wN#lRPMk2U?%)-WSS(Uu5I0-_&GlUqD^8 zLQUvHsT;-0Z{u+cnNH=3m?xnw8xNX|IpUDY&~t4L&}P3+_Z&0hxFZ0R&NDa?VGsD* zvSdBm!R}tV=fMcj;8jzL{fLrv^Z$MUP{XFm4s^h?N}`j?Wlb*zle9{twY>zV+~OdQ zdby{}DWe{~f{NC7)#XE_Z^Wp25Uq-w^~aP3kEuH?i9904lz4XdWW$%m1_8C?=`G`8svFD?i%95y@pqubOR1`UtFC?%rcuIns0DU*7yNj)Gek zQ5w|I0VdF*ItGXUM9t?yb2}DQshemPg5#hR{SCBsb?*3lo(*p&07*M-t#x-N_Kbbk zcA?T9F#pTZ$7>2)m*T0}Z2Yo*vY%3$Yl-WWp!p{z>o}ZKq4ZVFf9*6dG0kkAf4u7P zB8;T3<SsoZJ_hjk=CaGK9*3Fi};=C6vqcax@z6j36R=3qySL+XQ_KndDP~tXm`;^4>iE+=#8p6bn?>wrU~prP^CV zwZNvHZakf4(SO!XX{fCyJMzNXoT@wxrI>cKgX(AD1bTV($_|ICT2=NBKYch| z=OFBblip@3xv~FwS;IHMBxO>}sl@YUyWsvCGmkoSCnjkcC{K;Mz?6778n$HJoo~90Q>8!L z(}%+GuZ_F+Vmdjuli{Mx--nHym(~IXBqj-K73G2clT8+^N85RkmND|lUR%<28V^B{ zyhVOlc^FRhHnA;}!=rhjj;Lwf+dh~uayjv1K(cm44Gxck$@@$>bt|$b^EM;223L@;prJdunZczvr$U&Fq+e zyyn+0^kEEc-s(y-?pV|gB+Z~U0lb*U!IhK^k21wG&DirYK&HG1%01^5NEX-Gd{$O~ zgP=6Hj4r0rSK-84=hZZ^zlMrV!r7C%P-kc}Sz#Vd(SRe3&1_jVvy{5@X8%&1 z67ffe9qM1&KF;YqMesJVCTu@rq3|f->snF08-9&!z89X@Ri^iW6wBO>+81aQ1EZAf z)hH!ad;Ktk#9dh$40QsLdU5q|_1ho!FL1OjL4JZDC7n`f(VzA|cL%3@hA0ipr1Eo* z>7JTB&3-3p1TJEmoU;~QqwJTc)TO00&VB_7->6fueGMfGtIF@2q`tuj_41!itDr^n zLF|}+juHWOD%P_GG``a&Ycl&iK6QL@Zfd7G_(T8Qva(?O2qv=f+b=Kk!~V;HGz^QB z<-=i#(+8HP%@I&C(>xz)js8efqFCa3HN;Vf9Fnqx-NV>fr^HlCKRO>b(|WsEh>C$y zug|*v9J+;PX{qgh%~PDUA|EKK|A(ymkMVTc_e3uuBI1gOaYbBljAJ)r_Yn~hBoQW803ZuB!f3s%omM`q#C^IJWB;*RhQlV_e5JJr1tzIJWH=V{B1~h)5F= z5fKp)5fKp)5s~})toKuoJ()`V@x0&P=ULBMpY`kWeAZ`umfcv{#(r#12}S9sW62(e zS`s>S=b!7#=y+TtyrRryCxEHq-OZ43BJ8!Bs)3y3IwDx*7_6#+lQD|ywbf2f32GkX z6bmRXb3%`A{+bp~>+i3_9Zv7>Fb8#W>oQzA05iuzH@uvQjNnFQcJ5O*XCbt3W_QhO zdH<{KP+y~GBl70*bk9U~PM)AWsLOMK3@;Uf#c12Q=y}+5(Zy%VK-z)@SqDMqn6M1DLKnoH^+hVV_0?R#N0 z7~wDb_{Ik)8ZhAxmo~f3bqcksDS$OvtncsNy1k;}Z-n!Ln$AmA+k{E&ZfUzVKnXhS zn3CZ-9pO)G>uPZ`UM5y*w(WcGN?3WAoy-hZ0n(e|Hc?9M)u2Yc=T8FoUV}K=YiK#HJv-*uY9F+lEOkSUuWh5Zr-~rd9Zo#1zy? 
z$22f5q!tAOr?Bl?U^b7_Me0xk&3>+bz!652iE<#A$8}?L?NV8CH`$$Mcl8Gj??t3* zcb_g)3{>R2reu3PDF3VWwkY^D7T$nK)n2xx`)q(9NJ>!P@RU zZFN`pTI19`)C@o5#j_LLIpnvZ6w=OUT88Vq!zC;K^GsXkI=i25xz*1ZHL?4n3Qf=>D%}8PmONX1${Pn>Rx2`^50Zm?}JrXMWLq< z8CmW}82m3Q{lcZfGj7S(gZ+a#6kX~ecxeQn=Oh6A0c4uV>hiYj=Ddfo3V+65^SFM* zwLHqp)>2#_15?@xfo6|1`88~#W zvS`ezM*k9&n%Fu+&6mfQ^SeC$%Zr#X^61J0mju!{d@?Xyc=QS56x zeiN1A)H1=$bu~#G%tJeByBre+BzMdXs^%RaRnuZ8M)7zTsc2_jOTlnSr25YM%$EAH zc^{!T%g#9C46Ys@K&s2eWx9py2yM0BfZib-1;c zt!lgHo3{H6E_ttDtTYUGsU-=uLB~KO>kN)m3$U8$S^CZo{j(NsJo_VD@*S@U_K*h} z7@aH6fI|U^P<`8%*kPcT@*v2%*5Scg!q`H^|2rbLa|muay^?^thOQ^^I0tY&kY{ZeHl@m1pB5%Nti8ItR#$ zS4>-^v7g3FaV}D3DQ%Ud?gaTP_A7F`9gRb_^g$bAXOB=#tzQpVu{3=b@<<0euaS0=1^l>TDgZMim2} zcdmz0rw8wgL1QC^tn1UalOe-Ztz8*&`}$UURgxN>;A_#g-CvD}4U^P(wBs^*yk`(922jtHHP^<0fw1sQ@at$)at$4O1n2yr+>7gZ}y{-u)P}1dHs5f7jv>G zE7c8%1;U5eUt|PH+Wk|fOzcmrt zZigd~irFLkcjVdi_3WLlWg>@+-<3NBfu0+0lDHd@wpOPV2(F~D+=EL_Xv2zeFRW0j zE%ZtIz9gajw=W)A0V0`eVGre+_(q)&QIR3{$!G?mA&$-=3po1GXZPv$4N zzd}3$Is%+QwkEBnz_!)br}1gjmu}amnvW$)#^y>*e>T@EHf^F9xK87~@XzT6kgSx` zV$Jq1fRW{(5Ja>CSZx@%DJU-mm`&y6qI^c~eyCUhm z0i_CURGl!Ys-!n@^17v(|G`}EE4#znuG3Ox^h`It(?2}x=pAt-<@sG)nzd=U*+Spz z?;34`M2&;*SF4r$EN9iD@x|CW{i11&RTfG-<&#ZPldp9a7O3ACdm|u9MkKrj5nlRtNe4 z7XfZzlB^GzA2DeoXD+8C{Qc7Jv(7%L)we@25#a8b8lVn?z2UNoadmkA+lsdB=mnok|(5G-ovzcEB6a9(`Aa<4O5{$!BwZW^Qt`sk{2ubB5 z{|7FYcqRUywkutWi}q39eme3(=QKIyhg29x_v@5k9FaD{LP>kb@7X_5)5*5{-r|Rr z?f?=1wSk+T5>3Gh>=-&yn+D|l=NI`Hoz0CJ@E+ZQ$^hp*qRB8cfZbSAOR!cld?oBg zN^HBWkGv&-bomWtvi(6kC~I}-dMMREVbNx8$gg(L)SPr<|7+s%-qd4JB>odkA-J>~ z=e^Jr;MB@c9gR{x=(nQMinel@joN=3u4~$io#iuhJ4Tt9i<@oYj-*pTXllN#dF0OA zp^yDdJ#bfG%~93x?*_XRpRXl!sskp~BWgwU`+Iw0MSa9jA9{k0T`sEkBeeaC{(?~< zzZplKjQRS({^pUn<{0`ABDK&4o&!A`uW;=TgQ*mC<|Asdw8ck~1_Qz9JaYd$Ec1{;@X_?M4yh<=M>)-Mea#t>T^KfcnXq{Rh={B(_qqEUAyk|i)V6!@vv?A zdlpQ;uTi$_In{Zd!x9(sbsgyWA88MWwEK1&n@6eZD&hq{c>9i5e!KWPS_Bj^tgrJ= zUjkFxHh@eEV&{RE`&(@ErcQvG_Wp|H@|x?kKK5a)@AcferMz~?=M7MW{MpZ_5&fOD 
zNI{i=lf5n^?9zFuZzqYB+)iu5>N}toS-ryAOAGP!E=u39?4#W80TI^iaI~Kmulf!S;qC)PhCc4 zYsz5vS$?@>qlFBYNwMjA6bVGvLVlCAzAy9eTFOXmul@>*z$fOMGvB7*uW`x7?shou zH}FzpoEtB5j#~K^nOCjdxN+4GRU#?14~?bWmEYs?`tij_ca*yM1BUQcyMrzPbmM4w zSv+0tQMDd<$b(BS^rj^F(nO9# zBs06};9a*Ig;DtCdQzT%RZU0tk8#%?Gi4ov$e1L0`Bru*{t~EUWkXflaY@ElLSra* zRrB`oxV&s`LV$Sf;slJ!+*0G>iTx987n`qtdPaU11cQ!|r)w>mpekFZV91T<5mB7l zKVD9mHV%9IG+gB2>nfn|^dALW(}xJbf0RM2nA3?<>pt^8V&a z1$r?)o}HUDWw}2G=p4TCouSf{+oXK1Uotk(cPkiV8-&ioMuL25?FUet^HIs|#uhUD z0@$FiKHI}Cgd*QlufC9`(Zg-EU~(}iX)p zHBhIsl-arVE19;}VTs0w(*@pLkI>h4jZrIH%BU!F1g$k}f|Iq~b?*02epdB-Excd^ z?)uyBEPH4On=#3tkHV(!ut+2k&E4!kxd(ge9YLuECxiLU$BS$mOp2T_9;Hk4Hxrthr{E+gC}qXh zRwMsP)c4B#k5T2r7^vtkNnd%dJF>0;t z7T@nWBTkvM$Mqkr{(59Gz=q8GohWWVrFQFF=NqAukNn>Wj_ChD=dFNZDq*R0-sJa) z#L|Xtd^7Bg3pJJA(x-Hp{N9>h2Ik4)wp_PJUAMbVIm*skqr)ATNQp$@C0K`y%>|6ZrhvCT883Q#Q(KJvbb5Zra$6U)-DOTl9 z`8e3MXGdij>~#Ala1lbSUd=BELcuSh!Asn}-Y&u*UdiZaCv#Depno zNvn?D^2FN+1um!RZXNj@OlLb{?9JCEA>W>_&UgC+Mt6@^1APzCHMGvWQMv|_jJH_< zrC^eFUd@4OO_QkHKWn?(Uk?KI!i_Rcn8r!jhiZ^YX8$7y;MK z)(6Aq{hbGIsb!)sf;CfTf|!ZFgqK=FeRa2Gim#HwXj{Vj8c1GNjVr}Bpt}6)I?D9h z{--t7MZe2q?6!*ZeSnT;So{IVP;S|rg1mi8Ic@!zTQ;>26NmhEsgd#QZ{c_k#VFL` zfqBElVSrrJ0fF^>dpMW|xG>#0d>ny_knF82_xvOKd$c5B=4I-rq*9x(%Cqihu*+7h zjF(@S5*~x~UR%G_=*d9Jv5twj#Wp)7zw=4_JIVu)*ELPfpm7=|U3vp!g_0@h>9`bq%(ACsm-RO( zxNp6h63)bFas0(=PCpAuy&FU9E^Y^;FGqG&z2J$nd$_B%+`zDAyAU%$Xy(rgu;i@<M zYx27xr0fvnaxEey8QNFV`54?wg>zc7F^=~)8G=>KBe-DF`r0+XYQ< zE!@^rf5%GEZm0*j!{ocVzT7mcSxl;avZ{U#B6V3IcYijqk20_Gtte*x6XFnDs%=5`mMk#RGgu6!}?Deb98Y*yn(>`LZUl zKjdDS+*qOWUg&?Goq2`*=S4W9pie=e511Zu$M)&n+~gJeWmt3BHK2H}y7s04Esk2y zKRUKh3-eS1OftX8mJ%6#BhPNDNRn^5*1lKHi1|!0fYLBym#^{R?cBY8pM1XKI_X5$HYj zkv2P@qX9lfMS`o^KG;v-ymz;7&g%11MDo7QvY^JGnm)r-SN@x7IG+RFp}KryPz?I} zSN7*geVJPr##)$}uL9Tdi>hpY4a$OHZmel;-vq0~8h2<1aEe7^`rEgh@An-hnY?0R zt_&LABO*Xw&LteG0xI${zgZM*k%@mq%D6KRJaouIOD(>+93Kj%pj~`WSvw3D=@_qN z<#0F#BsDv^eQO_e^N2pJv1&#~=D99#8+`$mVGq$`t2Z5uNH1Y1tCl`rgWfS%t?$qN 
z?9V6+d4v?ycDZI8he-YeJ;K%3(eb&dY|IsFf!5I9WI2?>PSxOvIE`dv#lZSi)P_7F zU5o3bV4a+weW%Qrk0V6UYvOLPEEOLI;}`|KXX-sxtFDUu1N2E9v^|J-%*x29WjQ2`*Q-9B%2?4LljycrdBlgaO z5n!ahXQHB9RWsV$Q+u{;cMzR%GbmJ5ni#S4*0skwbGcS%GP#co|ARG`X(Uh*W^o2fYVTS*3f?o=yj+09K6+~M7Zph zsBeQJk`;9pQKfr(?%nCj@(!1>#-~{k?hL5#F;oK}g8jwMDF#qdNnBU0;U1KHZ?C3( zuj^!Hq`ngF12b@Bv9e+jQVa6HLY*D*fNN5y=HTl{wLX|1g_L6xa4l2AZ7t&fkXmtg z53Q&M>-tfWpq5Ma)jfI~2)PL4*>>W!>B6Z+K zCGlLJ#ERyD@;uzNSSHoBsqF<+y7OgQn$7D)L`2Yhnzv6H^F#%sC1eNrnY%>WU_QX~SiQMKr}i@o6&P3bR}(-2&yz?<53`h!VDZ|e?% zdK;l8?ENUC*E{{~(_)lxa)8rgvanA`xJjArob^rW)jttbw%Y{PWzQV5=u; zspJbdBBygL#82`|9EqAvTbz=w0zI;>K&Df&_xF7V40B z`ayn{QC=OTs~nE-xWMOsS5}fEa(B6hD*2K9-KNvh3y;d(9v`m{iE*^6qe=$QWZ zoudYqV?mwQrN+C!;UycV!JVPg6kBdiZOl16kNnjtEozx-A}hxYg*-C} z*r+t!Ok8Kd>Zp$U-Op%5D$Dci&hq{`+qELs_A7cp9wn|_&9CEJSi+QxnhVc^Qk9oh zC<{tKf4eiZ-EaY%*5KF*--omVjP5w;MHjhL#v+FK-2RTnVK9tuxkvZ)AT}pN_DI&)0#NSNDuy@hxKWHJDWJ zE<-$}08ZvE{B!;tV_=e1{nlQp!p3plo4ARS0b=vZ#I}G{duNT$I{;#AAD!0QX$6SP zWrEKvjMc}YYMaI;1H;X^X|~4#IcR}%J+7r{+5wy#N%_CmEI~uS$)~z|FSWY7%5KOL&Ag`<-`L+#_H~*Um7z~$wXX{*1DqzZVS6)a+=9?V zwADZ7jd5%LbNRRGQnw+Jrwz5pe|w+|`CO(tfGLt_t~H;^)17@1+{{WXxeJkIGCtv> zL&@$=Dty9xsnH@pO?rDBPkL|v^r)3p8MqITSFo(sZgzsY9~VgoM6kbE+#f(yQ|7}$ zk?4OnROAEmq5ST1S_K~fQghB+pt?K^S2rOw=n+VEvNytY#IEgrsd9u{@cm4{%D)!qNtor*j91+XPA)&%i0+rcvRi)EmtKCr1v|Z3;tw zM{@^iedu{yI%vfU)Bj8MFXS-=sjr+D`^Sb$#cHlIFZCQXKrPl3^)gO@r)nYpRo5wB z#d~eZyoQRj#&~_;rS$onDI*TQ?7*FFeQDQ9oZhlsq*ZX4MvYw<_8RKc)D zT@C1Cj8c|Mo8`A=O8W$*5I$jQ`T5iS5zCeBE9WzeY%RZ-A;Dz?a>e5Fn$s6>`p9G@ z{$>BHW&C#b(pUWrmn=R*O#qdCp=QT#TzAde;$}?l-{MlW?ck7v?cQg&Z?c|NkJsp8obXAT71gq3Y`H;rB`J0+Fw8f=ORlu)l$xL9kH&mGD>TJy>E(% zsHAm4ZXK!-_rm-nr_C|>A}}S_ADTt!VocXe{;AfYnY%<@uId=@OR`;R&~;7A9mGWv zn-ZpXH9}T4+soKYscVA0@2&q>-c{>hgYnR1myOUQ0QqG9f3{w*5lm( z+y1N}Ok}Z3Sf_3|X9Ou1+YB1De}YW`(S2doQpfKVLsP!8PmRTby^%73>4t2_GBkU_ z)hMm?&wjzQ;8La2MM`VD7E-do%et7CV{pV#+h1d_8^?8;-)hd_;q}!DUwwhQRtbzY&v9WP?WA_LyMt zH{m5WQy&sH=X%F3-SrmNGCeR=LCS9hx?aCft@}2(vPe~cZVwu+EkD!-NQQ@+J*OtP 
zJ448%Z_v^Z?m|QaD{D@?WreYQUL@6A2qqI#5Wdtd*^Sh?=q z56X@=46a~$fMj69#Pq`2=4|m`e>Zy?>q8GkkDxDrDO&<*JeJY%77rDGtOGZ z?BLp)?5L?~e>|v6aL^MFiLJkE<)xMyfK&kGx;aZcm7kposo6fAC*}AOz33SzS=eB| z9R&eSqhQ{gp(-&epehr;`jzzs*IuMoGkm|0A1=CpVdzEI)f;bp>o>J>_7a%3%$6)* zFG*IeUq*LrpMk`@ zZES|>^m)K^^QibDH>$n;RX1Pex1Fpj|KgCZ0=7>q{yk*?=reRv4aJlKT-x`lnr~Ok z)ztniG7?@}R_X70xOqT}{vM1Fhw1%<_yZz^;>~D+{t-?GSiRc(|6iAy&(P>Tdb&zF z6jddysCl>00YELI#;@kb)$M0H9G&cr*}6)dma`*pX)Rk8pKfOeADM@hzJ05g7jE5n z6uN69rFR|;b`>9U$}vABWYwYuhGQ|wb)7VN9F#I`*WS%FMzHuI$0{Z^H5e!hyUX)sHJD26Vt=ebc>QRdWtvNT$z;!>oBP?5s^8u~|a`x)0%Wb@5H-mz<+eL9ZgrWD}P zLMaNA1xi64+1_^TT-`rnif2o$wtCG^k5s&k|A%gXNy)Cu?*B1Fa&bvnyvIRpbu_;1 zlTcc~5J}Sq`lqeNRi<|!(#d_;W=AN^07+OsD=&a)*V1Eds86O@DCs#ZsI-FaR~w*x zG|fYi(H2IvX5ri2Q$tH9;O#|4G!;X{-jKhcAK-{uKPC6;fxKiVjg^i7r~B1P-;GdO zWZ6m5J6IU7DMwoqjziVWd9aSPQubT=WVYy}w?fIq?(JgXG%NgVIHF`jm|y%Ibs-Ow z|FGJ-BR^_kgSEZ=PEd};d^Hc=6`-Y((cQUmY2o`GaA|@~ps0H>s<62}@DHRVAm~F= z1bTmd^A?7ZT48(uK^QBoBsZJ@}wq?lUvj{ofl;~2=1+@iOIs>Hb?JR%w7xIfKxa#jkNVZjZGjzR_ z8z;&y;bqqZSXPx+bC)@_nUr4(B+oUq7WR4%*>fofpnR_$sw2nV1gcsp{8{@bJQ%{W zOomm=w|n9W=PsJg-a&MOf)SaM07U^Mmlj`IfXDa1r4CaQ;QM_NEB7-_QVBE`J0GVaxW4EP2SoOOG1_ z*yd}NJ{0GrS#PB}^tbThu~DTyPY=iC+ky{aRQBeYa|Aw>Fg!nDv6v+RL#nm=P^BIP zdAF14Hc=P=B3@r@@)*}?>%;S-Bg3eje1#&VCg2okp~5zt=~`u)n$3cy3`IH%l>ygeTmJdYD`h!0O~rog3EPXyV0AW9 z>3z7h&-imdVjEQHl!TuiupZ3MtH#RnkZH`Kim3DR`!e zyV{aS4HOq)RKrAhuI#F%t&0%^`%9l`11?h|!xeUYu!k$_!*n%B?9JuJQUsLh+@epA z_BsU7{IaI>^?7pn^1tv;I{-;af8dTBzwb?`>Zi|_p=c<;+KGlY5>Oi<=>|YPcWM!M z&Lq~fuPf2sHM&E)y2`a|iOEu11FnXY=C6h+1TMXTNkQ(e1=91Z>00zbjrCM#d884b z$hfSeZI+vayJqqCYO^_KZo_r8>2xBxc#Wz%kV}izTxN1$0DdY7YO4ENKzI1p)NotMT3fd*da|BHco` z+=?OhOH2;8<>s0TwpT#s+u_t<>5IM?GpyZ#_F7J^f2f+ivwtcZ8t>}y>b0v44|l`q zT82)lZdL>D!FugAKW9#G87T-iBx*EWp=XRZJdvpkn8;|Q3jL+xV8G9w3o+G+Uj_$2TwqGom~v| zMfYTWZhPNp3jNRaGM26DX+$b&Ypv!z1A6yJIio)7I)&dbBY1~y{TwDm9ci2Yp3fbF zKkXQJ0izL!D5Mv?7(gQ@%9nt2mSKyFZS;8=l~zIgW4``W?5nt{!TO5-I~E2od0^me z0ft`3BoE~zOo!<4*h0AlzX{91;-j?|da(a-PkZCr`O(*zihJize}vzKBZ{e7vUo3W 
zVOJc<-v=YuagtNI4-g8Z3pbN7Y3;2?Zm*%UwTAK=DHkLcl4IpTi_s=)EF8JdG@En~EHC9^R= zDS${XRXyM2=NW!D(6>-mKN|Mbgu3`H)LS*lnc!+5`MyuYNrkm(8jv|{6jx${=ZB~}(2D7Dsy_}O z!r8>?I6i29RWR)VBI?V^P2@z^T_!5*PtDsW^;E)sE6~ZPRAvH&Qzych>l5;bnE{hd z(e^^OW_w!yoZ)-%w=#Kpf0xe?_WA25Wy^4}=a-vRnW~z%^i#A7zob#I!IA8G(>T5Ed}d0f5Td(VVsYU!mf#Ov$+6B zIpuD$#<2@A$y>Kc(CT@H?2G)Ol6N+bM4AJbtXePe-~gNaDd^qOC{DQrDHwKgy@zsE7K)C!bxOqvH79e~uuPi=b$ zmj-X4!bZkcK_kfIdV{q^Dn@@3O|G4urxRR>Oi`M`*iN~mYG9N1!Bqp7UqajH)7rLB zcJgcB2-6XZj(encV3r0d6V4cDzFIjz*KjHKdz!It66i|JmAR_PSoJg`1U@BLS>J>- zK{?)mObLWRQ1Mf+hVMMn8f2%TrAFzzlNOe878l{!KNbh?xg^rI%hfnA56g~73r3UO zfD~=@EN7>^{nHhdwZ(_skIGA0P_Qjk{nz6<$GSf%!3)ap&rJzdjvM@zHpLsR9sV{! zZ>mY;M!!U4JQP|5-Gqsh#aJ#w>dlDwMlT)yYMigAf4BG{!W}Nx_*=oqWyJYW6cJeG z7_Cqcx98Tj{n?Xu1eU*UTh0KLcUv`VCW`!MVK`_RP6k|QJc11ZsLO^x&pghFp;`{9IKtBDU{6vgL&?+=Ox*ealfe>(uB`YzK# zo2B$&)c@3Ay1hD&B%O+tUIlv$l-pXJ++IF6kHfOXwLrUh0!Xsv^zwD0f#h*zUB7lC z6(o;yPmI0yX*kWXEIpQ5YUqCknbHl6?yHwRn?&kpygC*7x%{p{)TTBI1kWQ<Ll)O5t7xWworP$gOazh@c9;&x9@SJ zGd5v*`@wZyJLZUM`UxCyTv002A&)GjzJmRn;&dn?zb~up#fO0&ubunNTyc2+vn`ia zKETc-P-B!%(;I0_j7Ro2bKzjhNLn(!r@4(B-D9UG5c4tp?Zfqbd@PuDGcr>{vD_Yq ziY&T=XpZlR6+VMBfD?LRT^)%=JL!o%+b4FKpie?1*$PM7wyvMt-y>{Jjrpg9SlUlV zGYLU+qrsJOei|ZT9iEu-ZGSo(u{p??KGOd@I5DA(o(UT##CQSD0#sm^nKH3du<;A>D7jNe zD;U#-KRwD5gAF71I~=~bl3fgDRH_|srIZ?#29QeNJOd950%@gN1v)G*^3^>fsCWye zxCRr6TvwCHIxtnYE^+bKBU0{3`$XD6vN4ZX&t<}=g7kNYz`A{6k6aA(sc_^kMSdiY zDBC3!xOj8_Nb~rr;q*#WmZj2Nw_@LYOLXo~OkG0Qsf1Y3b zox1YI>%j1iZ*@dQ| z-PZold#N1#^ZIl7klc;vhwv@DVZG!YT)AV?WS#+?x=821{I4tPPtrg95b0P` zoNo@w51{0)x%ey9gZ&+b_1?-xmT3X-o-$_UD<-@2^@ ztp>b@CO`*qoknMsfJk1369~duZzAiIfvaqbulBc&URT?YUhDCO*4kf(QzO)r1-tzi zM^Ik7hYCRl37}(=<3XU>lEvMMKoU2$I9+p5jqDh!%*vEra?3PJ+u#e-ouxGcdzVxk0K-yaNE>D13`F| z-7w7%^kaVK>^YZBI)^;El*X{F+sy;{P*hsR{^r+z7@XV+LBtx8C)i5(z-GhJ5&a!g zwL#-ZP+>*8u9`UtlH7>i*$yf^8rC#ShHChj+$w5S*`SVv@}|MTAqq#JHHCZ1ukUy; 
z^*KBv+}NYd*W(0aWT0Cbp2*lluE=lxY6=C-;b4`GW&Zr!wS79_JTCwfsOu{9~qML<`>O@F!=PJ*j)0_9ZOk~EMM7Ju7ZL^YBTx`VNv`8hCkEB7ZD91C(2b|YhlhOI2M1qt3Dm?gvk4hnj`OWEtwj@ z%E)^+s101Jncw3wvR=*QQOCMBzg_SzX%sFceq9SOc|RbzE!DjqaGmlr8*r`vJ(!=@ zRPqnG&X7D|@4pXrL-qk&Drab>BF;YCkHg-#f}=<6 zc&rblCvv|`8)2UW6nv}kOwW1>N;Nl^V$9PR#iu~GZD`Mc$;)Khd;TmO(d&IBN}n;b z25btsMM&s&_WARu6s#FDTA0=sa1o$*0JWI>BErkeRm1ho`4X5mI=qq60Oi#cSDdEM zKRv1F%Jmu|)~@nX_!Cpd>wX}hT{C4SeFNwklp)(hO=mzw$?07~Yp(IjLBHgkLQ}}m z+prvor&qJhI~}ScY$y6Be5Z@Y6nl%UYQu37ybQC zA0pH*;h%;JD!?D!`cLIM^%Xvu_Z7_4R0{pqxMa@aQPVZP>6xWMe~U>K?xzQSm*49H zXYVFvSvmwpajZdWYd`dl@0r^_@ydT@!@zX%Z(fauJhn6~i-s`HxvX@vLy@`Zt^d37 zdKj$CK8l+Y!Qr4+#HsLz{@z-RpgzDU*VJgu%uR{F$@9J_q(?(K@7lptGzixz@$9~> ztx}K0q&00)fo-GCaj2*cTPlw4#&+NFNoAl^wi8^-kk$vB2&fT5Qkm-wODE;d%gW?) zvg_ndP(?jyUJadsOY+v%mFOafBw38_G$eZdeYI%oS@^4Uxv#P~>h!S5eM^)40|xv?wCyB8YB(t;L_`Is?Tn zqeQLRR)Xi}j^SE)xxjVGP@#`nYrZf)^Yy1khAIB-3X<8J}YY~-h^2CWY_+g z$Wt3bc;z15V!R*8wLNug7Os;Gp;2q~&nvIQdBrZ@K?bL*pvYjD0GfmP)u^s@|Hrp} zZKVXR9@@GgMPzloz!P{=p+`?wTw8cS2@?Fsa=8!TdUa1G>fmC~$NVq_> zkz;rNocB|D+m_qC$drAJ(12|KUMDBH}fnB<`5f%%HJdK)V8Hmq1lIJp%ls_aNlAY_Td2Q|ZW z-eq6$R9lgL3Y7tBs#g4;&fQnkN9r@Klhi7nJetw7h%O6toOi88)JXaqI$2nJrj9#$ z9!`~-9-3q73m7kE6oUqH1eiAs&+n@#@v{)kB+$afex3(dZJMFw3(%`)re<}@FM$;6s%HKF zs>gK%C%pldwZZa-{U+B773$|(*Akf?ty5XP13E?IfO(w0SYIv_27F|)x;}M(0J~|W z>3sXzyzwJ8?SMxa_cp5^|KC!c24?Dm?NHdeY{Zt~!vHnMkWrS*!~0v;IHTRl2i%Pl zR-9|rIub)X6xq%r@~9+G2MTBxM<;=b$lh{uOnzmaD~rLgK}22Vs^cKV`FZhmX{B8cWc(N8ib;xjT?SEl?`e>e%3dgJa_3sWceH;IThD=dYZPQelX8E0I; z@ZdVqZ1ekRU2`3=d04qBMT+pxhx!*KNYVu;FM8|O z)q^h#5G+UMy$DF%)zDq0-it9x+&s&ys15|GjTXjeu!qxSw_FXTlsw%_o@)>hdhHj^ zdd)hV47Jwa!?V_m*7whCaJo2AZbT5}Se;u@i|U*5jMo~FXB;AF!J|v24jgmtNF_6Z zlGnFx*7SYQZcZW|GPg_gO4!I$=cHZL|0Fm*Azck8pNT`wcWh!*_#aU%FX;Ug!N*6=#yA8Q8rJC113#oe00AlcdEzN zwXM6;Jr-D)JZJk9#%(RCCNPIl+1&{V^H9cp3+=Q0F)zE(sh3Tot$Xi95XE1Z{Pw#{ zF*l6L!S#VPYg91S;dftmO#(WVW`7Ki%KB-r@l}5rqL2y@jEFYZs~W zS)^>P+DuV!>AlV|W?`EwwO0z>1`st<}a~%9AVWgXd+}WYwH=)by)h 
zMBW0i(+~2yC#M$=&<}dJzYe8)1I&Aer)H=`Z|3I(3x_oc9R$-KE6!bgo4<`o>D%^& zE9_o#(0!=?XZR8RvoC!tZ_vr!0)5 zRZ|}!RHg4FLiiX+-gea@gXcfNL=-Im@TXw1+BwTL3*2Y@V=GMp)!IJCB=gpV7|=^^ z{36c^FuE%HFI%L~FR`gn-gWacwPMZ+flMxUj0$W_l?c(Iv?Wj_B8{k7gUe6v+y0?l z**W+fA{8;!{9L|=lPSB8Y+^DBl>4Haj2^S>Byd$vOrwp;ZYd%;Ny~lcr>8=h8oNLr41c} zPz@8)an?K*?99-9j>Xx{EmZY!eoM1nwVI*edNxBs;w@v0GO@5fP!7p_Hfk z{Ygopwi1y2$&hzfKPgMfDPT9YSSj6B9x11yG|QdaHSg2%th-+kE%bD#TUM-HBhR-@ zdf6w^C&NBkjg796M`u(Pft&?N^n$C|W|#LjFFxAE=RX_O6>O&BJfG9w!rsg+Uop;2 z3TOOk8M8!xyYoKWem<;F<-J|6zM%h^OOBr}?6DOtnYk#pD^+tCy%>y)m;Ik~3n)!u z)(LF&i7?nde))WvTUWzIs1=veEc&0gXsPOTa5oB>-`7K_9&M@owl{{zr%O#cn-E?5 zE9VBCWeAfV#42)~<73NKJc3N6ZLA4wbKqNlOKZqU3UuEvo5 zFMnaCy~cGi{H0R*GyQuex^fEBI`h8Q_CzO`7S$C`IpXNQtlPC$!&Dk~r--T1DLjO@Vl14KaH$Sj@5NQ+iMAO*4usRg+ zxYUFD^628u|to*w`sF1@&^BoFpX)4x3aQ2)qy^}YjOGFM|3t@jT!44lmU zb$xX_;<}#{M96B(8IK{;sDA#JG>o5;D3z?OGdzLonz4wQH77ioXD%BWtP_)->YrK3 z995h4C>gjkvhAj0!hHszUGn@&08J6czbMIZO~4A0k5Ot68?d>GUFM)2B?-$Cy%Iz}@(x$7gp~!mw=*#~P z+XS3Rinv)Ao5N#Uz5TXN%Wzz~Q8kJ0aLTiSo3dC!8qD{Ie7JLRiJ`u2h5i8_(Y5tn zrTP(*7n^65;*j4hWwpk4UBMu7C@xRUTWzTm{|>|G7%M7P&Ec*ihOM>r;RsNIJ|D`z zhFX!I*=XnM?60E&RjyAeS_PP-oubvIy<;#0xAL-$BMcU~TZlSdC@dCWiCDZgHFP|b ztk=#-dPJUF{_%}>(;o%oP8PLV-<_;)glY>|v*)Wcz1l{QM%;3LqlcKST98Grw(j9v>Y7NUm8v{kR-X zTK{HY@zJr0!*e!Do^;%5N$2G0#RD}ppBpgIrke8tY9ll4A;6-HXD$Hp5(_noabb^r zS;@pjaO6<2V5)Opj8SrLtgyYLJAjah3T^D?K}hk|O;l>D0jV1l1hjVRnmnr`48~7$ zU2Z_k&s3vY-`~J-u#DCl5#3<>Ae+NKmkDtbE;YvV#Rnqu$`DpI>a-ROVC1Lg92#nn z);c%iQkIFf)sZUEr^H>LmIhjMimR~NENz~R!nNXfk<45JC_0C{ZnNmMxpQV**2i3@ zDzB>6JPvkx!_(QgU0ImKO3sJ7;b|L?3f*V{sC}#NK-C-CiLzDJDMV7euBNMLuw-rM zl%Lrozz%b}95uyc4pseas`Q6EyZGqb%;G=tg$Qg%J9v06*tPKAXA;|w>9YJx*>_6e zqCQ|{S@j!?Z|EPol;T@ed?O|uVTFFGj&JIT^;zw?8Iff5ud;BoHTqkyN>WW>E$tx> z8AaNjAi4vhYy2~%gx(HE4uk90Tg&Kwc1nh7x)V+_;kDmIe7mdXF0n4YtL>D%yMOM| zwxj1BShAZ-8Qn{50F`xOtoE1O2beC~G39nqW*)yE*;-i}10Tra3ef`AJ_r*2%F9jH z54rA4!&XddWX@OcfuDG$z8bC5I3LDGj_ViL^r;buv<_!UX5-^yC^i<``*stffw|(? 
z(I(JdJ9?b~{t4HS)8_g*e-f14+Smg<1*ImI-dh#?G&V0P^J^RZo{1m48F(TCd$E>nBL)*CHfgp;dnwTki*C|7e6V`IAN%aE7N zlsWI!{K5uYSg5x^Jbn_9oI4&9}Fr3wYR(td5dUF z^oIVqOPZ1D-5v|DMrC;qPF=7vwmlt)^py>an%lLM4=@qhHlBlRko^$l4RhO7-bb#L zaGp+OnEn__>oio^AXuA$Kf!esRpHt`1+_e17`{n91C|uE@~>~nf8IZ^s)pe&!1$3= zls@j9WdN<2>q|d)H6g5!&##~qr6yU=eT_)Q^+VR zPLy?x)_{>}DP9wEwH)ySLiX2QvSx&)5HPc4#vpgd6YbXRlM`F^wGA+bB2w`6bvVIc zpeLCgs04@SW_jQ08$;HP2qD%{WyOn*M9Aw3jxNoO^e8xa)0*0WD@S7_cu6yp9RtLN zm&Qk*tkI`i|321F)y5}i+Q+$01&z-*2c#`^9-nlqqRYn#Kty!uT%Cq+A{?=fjyvy! zzJQ2WcX7gQ9*-xZQj5G|Cb&9P{!c;5-Qx4rA5R66s!BHEoCXsO$J=&>@abST_dQpU z|LU7+87{SZ1>v3jZnr!?+h|Zj=-E(2<=e6Cmp=y+o9(JU-Nt(^ zI%U~nF|IASo`*_S?5f;nVNwe|AD7pagR$IP@Dq{nw-+Mv#{JWyJL)`#i!hR|W0>2l z{3Ly%zlq(iRoVcC7|KVfToVTaH44!V=oDZEv5JH}w=-MU)?hnXITqGFaC7H6T*@)L zpz=%qUyq9TuKjb$1*8eIg@;Z3&7%voDM0~;P-&@qN);K&vsbJg9Pz!o8A|CZ{*!!N z*%Omf3$?s)6(XhQywJ8iT#ZSBBQ?BTlP8Ayz_`~UG!8lS_!yApb44A-oUroa$fP-B z4W+CIlbDRg^^Mca?rj*@hBq4jg<|e-S%cBG?NzENDCzoWvVXLBUrZzA%rw=!k!Ayl zd`Iceb70DP`CQxPGLMkr;%_b6P%NNiN;}(L{b?^IlBW|?+%|RzRNA^_J8koNST_1i zB+ZU^L;sx66Lp9J{Q{MDJHD(`#G4R#n=^dc68FuRB)v@pYnBN(l{~M!w?ZlJKCNqF zu6<|S)<1Ia&f183dyj1|_mVqcFL5q3zwsu|J9Ed(ZkxUre`g5oS4`A# zxc305<+W8o_ktzV+`de&DMNZIANfKwiLi zzMWz41SXO*#hB#UBKnhg=B>M`hdl+P*9}b2!+icyC;D_uIL&9~8B9u-wcTfXJmb^P zx(BSO&eLz7cS%^QjpSTGb=VgW>QdXTx$Q+rF^xg?64N7cw_>)B6RHFv`C;8uL{)qgUeKoeGI15AA5Yo+8S{W`NyT-ool8Lhr%gDEe5CIABIy1%it8dZl#7= zf$gf&^zYFl5NX*XKSfG9vL_RNtS!|Xg-ov)mS~MXM`OBKh42>tp1_Ym5rW*$ zHgrCe;@Eo+D(Y0 z#yg;e0w2olD_5+bHMo}0;v>!2w;Adh&z5a6v!C9O8`MVKeift=%aOlCb2XTJnRiPe zz6Oz(QR3R@dM!rYc2%69G1n@~mYnQm8i(_`fjZ}{JUAv%UQn-_)5*4hdBGKS3#qvs zh|Cq;(9nXeQXBFd4WX@?PJ<;M3J3YjLVEGQU(prvgY<{jGC@F{r2pPmFl}x(Chz~% z8ESSf*mdGgJ*O(*lBb~wo66`4h%~WnqA<|TtKHpzOvBvN!WQ2Mt8RuAKivf)vx8fo!cJI*^MgO)ur7-gqDf4!)>hiI+aqf=%Xb)&} z7rnE8eB8%QGY{N_QZ-vAjE#5a?(!R}QRE&_ZV&!_tsdN)`?gd>k^5XnVk5?|@@Tpr zle`Ym3~Qq*1);x-Q$y?S4`L$Mzbe)LA&_+TG?mBu4#2Wrf%f&ShoSVi`rvIQ2-$c9 zo6iB3y@^@Q7mJUQDZg@=_Ogo) 
ziB!KvNY&Ptwn^_BFsV`^Cb-0w+V?Gzkc}>N=IwV-ay`0h-*t8P!uLHx=PzoOH;OEzuogZQeQ*Qt+X*moP%OP99hI+*Qek=f|c!AK^O5m>qSk zBVqYpIvMP!{#hPG)i;kukUo3AJRFmoHQ`G(T~0a3Z~M1bxa{KqIaqe)8I~wqr-+H# z%jA$>hgKL=PjpQjV^ju(KM9fn(rcgHC+EhQxdl3a>mb@$jvgaVT+QU*29gp z1a>-D!w}yFiU5#;mPv^k04K4Ez*RcwS^d+ys&LB#7k{Tc$k*9<-kDKp-{%B-wewkx zI_CxtFKzKU502CaR$Wr@XwFB(rd6twuZh@;=>mQrsqI^RPhIF*>6N|N^)BiY*)roR zfD(X6u7{g>iXwm^J^3gL>0tl+RGssVS)C;K25!qr)EWe-SWGeqtqb5XrR%;=U}! z&AAGxUzL}~)&1|jKWdG)igFFI3q+t-7>@2~zWje|*5`XTKlr^HQHp!MhU76^1S(>e zg$po+S~pYcuaj^Jwc36?L+v(%$`~A2Nn6OXhMnKk05%0y({HP<(?E(O9J8%m)C5c_ zVpc`XK}nb$u{!-cB5%H`eM#o~N|#(6(pS==~Srn3h?Z<$*BEqj9N)Wy&EEIb6IR8A0F zFoMz;@~nBMjjs=bDZ~mZebt>G!N`(uk?wvBNT%yxa;L_XS>thJQms|o=2-p&iu5Tm zn!e8PB%J9(SkDSX$TCQ2z{==`+N1Ea>v|0ohqOmJ)4^mG=^Ni{A>Ap z-nG6mt)sr+GIf4Q3r_PQoWhj;S_A$|m?T_7k8e{g<7He#w1TznRVcFDRRw;n|8bjf z_jM@6Fp4NYcS+;p8`!*d19v;h0<0*@&iL2Y(h`7bXL@^;>}`z-iPwYqK8Z0MThfv+aELwyXKG{xL0#q(6sK)dZ9(&nAK@fv$C3pMVm#!ZmLA#GVjUg|CmX(k>XY#>ghEtg-#Yf;@chnSN!*3s ztG*);alJqo?BJZN=~~{lT?nS|1JN{Z#sOt!tl-CZ1AbO95R5MtqKsu95#tM3T*%S6%HRrGQ&#jvE(KH$&BD5goLk~jxSnu2oFPNQ=X1iNQqY*(|cpVuc)!9c3BoR7&{ zXU1AURQ>h>Tq=JDdyGg9GzygR?S9!_-iva*v%E4dcAbVhT%*|lm<;Th*-@UOgNWpq z9@acCR%1Gk(WlzyN59X1jbD`Im75k`d7e%ImAKlTE?NaB$KCSz|@@&mM7o3n{tpRnpLVbk7p2Gxu;eA zvw6Ji^Ua*lKW>4q>5I=}lKFK_3wQyJ{3vSG#a~PcYGSIwGQ0$)y)%Y(k+V^%V{3`( z)&6In#uNb}cn#Gpy;NkAe}3!V8#bS$FyO0I*AB8@(-Z<*If(d8P@-r2oKwR=*O8lF zZhx{+v5rDr*s6EOf1LflkWq`(9D*y=q|bCet?f`8A#^J zM)F}E zI8(mG-vKEJkGGtl^gSxgY^)rge*opQ4X*}p3PYYR=Lf37Q|TAZaXf;&={JJi>yk^$4}B`-73{$i5#(Eb zEpm1e6#2%7w!VT-#>u=bonw`cQ~FyD{>F^dHv624ORH^;-l7GUW6fz;8Qd~4>OH3e zDYF=jEmqeuOvJ|myRnS?JPwa1LFU^13as*b1#r!W-?|djFw$3QzL3w6efQ$6k2o{viN)s=(!lwhJW!(mYsqAQCn6LyKpdfJ1e=ZF|7vGBYRvKca79*5b{&2)!O?y zNSQ}AP!wEtqe(4=*m|mw8cjCh<>lWlR3qFJu&M2a83NLbd`VjD8G+MgY7W+mH|K5} zOU)lw2JCK!UR>2by1v=uu7>5RbK|)NQdjcuZ0jg(RJ#`WKh=6OM~-3B(Pw977qaZ7 zT;u5Uy1%LsWHL~;C(9zcEf4X+UbG5ku&wEo%77+}fAurQ2AA@)?2K!FY4va#NC#m)IT|EveB4tM&x~_C&vHs7PtkcCb#de zZ^B!FwCWAU+)iuV~?Spi;pO6t9uyPDILGHnsT6(j2hF 
zHCD%!?KE`U-QQJ1XjAV==N{}*=Wq5lMYknbss)Z@xzm&#>S0L1YUWpU8vP>(Md*+BdJNVy zE;m>{o}avMsMX37aB5|KfsXQI|MNgwba<-An=IVWD&VN`j03jt?W#~pZPET2zsS-n zH*eWbXwO3Fj$HuOvU{qf=kRKMXPv+EyzA75_0sB|FCbE9v$ems2GAE#8TS_dcD#jF zlgLX+WXr4+MN&B_@%G*M7d>xR(fh=IX0p(?!?HKjF>`fF| zSRrrQY7XWOk*#WrdpkbrZIrihmXnQlfaKq)cx7^X7m-G}X8U~Go%^4;@>30>pSV^k6}n+PQv{U0 zE)*#NeAfT^3%0D!p`_bR0&8x9U*M!WQD@nH={ho@j0@kblDh z(bieN=^wkadFzHXc8I(|v0&$J3$-eG#1 zU5E-cjm(@U0391*r#jE}baSw;HdJkY4&BM%Ze6#u3`~ZvYc0a={^|FpqrdIV?+ati zrn(dbVp3(zqjDr(pF{gN7xxT5 z%YZWq)hBA(AoCzf7S^m@;Z2AaTg`z83NHb|Y zhER@jm)=XyfTUj^3H%^{RNJIe04w4JO#_$YM(Q)=DNvTDW@e)8J)P%ww&l@h@*@!s zww*Q42A2cq%s8#(Ik@uj)LXa8&9A8zh%T?S`1L}5_q^rqvUj|Qh(Lxbra)QvUdlbS zDEQXBRi2j-H8Iw(^QvpPvvO>8AASu^wQxsh+kali9;fe-kOi zsR;}B2VIlTxbcd~0!S0CjY5cb5UD`>)@ANpI09tq-X6d1_i|4;d2Fv{`F@{B87(F9 z0YWDE6i?K0oLfH3BjjM46`_9w%k<)-GHCk$7?h`YYJCzwFYw@}K;FMH!LP{Rvpn3E z@`&qmFlFHQTcghxh;Hgzs;4jeWb8H^9d9w>zQSbySUY5q1CoV_UHfJ48-TpFRj>Ni zb&|hs@pm<+eTR_u7F9!GzXy}dPSf8Hxw|RrR_Uk}h*a3{zggcjghQTQVl0+T*M5sb zF^1!-ms2NPdh=!F6@Iwu3>_=R4JKvH?K<5NeG(KIjw)1&o>|kGf{wzZU2LvNYX`usjybXv(`wo7t6l3FiW- z`E>~zavnnVM!7+^EjQ=Gq_g%?%7e>vyV@sJLy-acLS#lv`b~YUH1GL~ut|1M=v?FQ z#fX&J21_k!0G7*cv2CzVCa=uAt=6tZ$+~H(#U>dG)GVf}B;#Px7e}DvGl_`2 z^a$N&Td+;a71)&u(Le7{;4|$L4!S zo|}@u(vLUiN8%V4UZhDA zoA<^8upDeJ@0$l*cOCSPSv`NKXSA)IwFGqllQF-Hv+>q@xaWkDEY}iq!y`D^vI@}l z(LV;JSxmLVjvvp@ySyg{zizHV$`gcn%RE?c}}Og!Ft*&F|?v{a!$oXv+5bqU&U^^P_04 zcrRhSsE=&*GF*d0j>LWy%CK!jqD>QF^?=vVDM1TkLhi3) z-MO`kk5PH;Fy%w>K0$OQ+NZokG<`|a96rUTjCLX#yIL)MhD;uJImw86k<^;cR_p&? zAk+X$ZP}H+gfzk#dtplQUx6z0mERb+zRr_M=jgl|DZfGFeY++cRUePMZ;>g|@C5xp zcE1Z#p*Ly`(@VbZA53W0meC(_TiIdDVa0<#V!V7u+0zdB!%{I8A1!_UP&j?qX*cu? 
zFk-V2ARldqqmq$2DYU*mj_98e1BQ+PlB*4>n?-!lMve-hfCI1xjz-itmJN!>^gkQ< zDu&*%h;+7E>|hMgB#!HE8mdKTdIp00bx=VkjEyJ2$*G5G$0%JRx9pr>{C%^Rp9FiY zeZi&BpA3;g^ZGx8Rx6YOu z!dZdS6Q)ENS`Mb`mLJ5d*uJ$Sb2c{Z(D9kIwdov0iAP9%;yf3MG(5pnT+%oXmpULq z+B7Q#hB(&KT3fAO5KQKNW=psbmS(jRHF8n^JmXV40QX`{H<-5YaHW?F;8N~^+8j3s z64I~EsBImqffS`3+*={Gs2gjrUEAn339ZAVaO$KQ-1`2O@|UQrZ^Yy^KAGtnV3!jy znRp+%WeDfhwL65e(ce%nO&8sa%WF1RG1fe;L@4jD6Ookns{WC&@*Hgvy1IX4#-bKM zUz10i17DrwS}2|HQjr0h&l`gUGUc;+Hc?Jb02 z`kU7dD##Ql)4suL4mT=HBf8#Jt9<3iHk*6YW1U>)p}C}CCT}L4c`(v4&UV3@cH>BE z<%;1|bPga{Zb9tiYk$uy9qoI4pNKhWXSYWKm+jZ97Z(4%4jXTq1gkb~^izhugZKRZ zRNW7VpGBH4dJz#3S42GG6%kKP){WV0Mnpv1Opcv&l1}V&db```=44&_Z@QCCcTfMz zpHazXGFgx7xE_zk>ttO=Jvpw5BUvYt$z(GbuN}Q!uUDdoctu1+L`1wIA|fL9^VIWp z#SKY#pYN~UdaIs#s_Lnyo_Z=wBzrSMOE%s-<;dHN} z<3#2?AgwhPZo>=|C`{~bYoUsGPW~3Z@cS#dElxB&fW*Pk!JkYy*sN|30vdMgv!rl5 zgoy%{9B?IG@gjS;=3)t(dbNLpT z94;sA4`l+HUbFl1JOBe{!)$Qavl6j_63-)#WqfpomTR60h*Q_vJF^I_ayN)xGBO%wF<3Ym;4yAQE5{WU0s3PYo@zDc0B&t>CGr<4gmYC~aj z8;PLeWnKCmJWFrj8g0(Qd=Je2v36An(32*hGmgx+oC#0H^b1IScV*kEl{e8a07S;R zF%}wUqoT`{c$0?lbI=&yYJGNY{)DxI&RNhZ(2Z!$3{WjGF-U6}Mq z3I_E8LZ*WVD8Q0{Gqv2tEt>VM85SsRCXny6?W3b=RC?M+nXG{Kst-^Cws&j}R5?!JKV^)wp+yVGO zu7_N7d>@Shs$#Zp!kMkBrkNU??%_o2-X}^0)OS{Ob=&svoNc{I*kQUv{#H(|ZSvfd z@P9?QnJ#MTiE=dPw$ro&02B!{Mq0O&P&<7{jG8G1#DSW2Me!aSLd(5u0ffzl5&=n8 zcAoBSTY=fVdt)3MMW&4N#RTDwc?8bn&+G$aCHjkkuI~FQ6!%9IrR{D|e#<3*;fW~OJupqTJZ^@&*K?F?U_9=4y$_XK zaClN@zrWJ1f1(sWfKG9VeUPHNc_jfat01UeD3Srp+w#mw8B zU}8Xh1ePptt#i?0AB8B~PSMqkAdjIm$@anc#uNF|+`*y%9Yi$)nHaNa6?hUBViXQA z|Hi11oL?|O~K)8P;}%jcEhIB5MmB-ter zpK0iDP2sF-xis=fO+j+h;V4>-n#%=_)n8jW*UImBrR`jiq7!HhwK9%!y^wG6p^s_$ z#hP;EOrM52iB74{U8Q1*ez}6-Y&DSUoL4fRsp)c)*{i4&qBSFz5|LzDi#WQ z1QL5XC&vf0!+WUgX5G2YeLqv1(RK&>KR{ytwosvC1@a+UWm-k&2#=89;71_sI$~Fb zSpp$iOF!&h2ENoo{sR*F4AR3=kjOd}pk#uSkNi9p4w!w4r03 zIzm~1;KL3l7vyzi{e4o6!_K@2lJ&74JW|#Oi$SvAG8%_Jmmspl&IRg>ODo;OkBIhA z-|upooZkoi0fYtRW*{TywaDaZS^gh-V;}tirrxbY~F4ON02sTihfUA zFhp+vsC!>ImDP^OV%$U_q%zc(%N2{A`SLiNJC7mufEqhtbR_ovH+~t` 
zkBym?`0PK4_?x#TNlc1Pxe1vc9x4^wj7|}FV;?)I-gg@W1J|~!pf%KAZ?T~g)#`&7 zjc!}S=-@H4>MPen4ONEf0F`Qu16Zx(X z4q=@R_3t2rB=yH_7EgNCl!IG#?KW=u{8NA&D7j3=W=6(yeFm&)C}UGGhCGW@$l_rY zS8_$4d=8du3MVx;&d1Do2$&^bKf~c}J3Rv8iyO2kQFj*LFd!R58aXkFLZHZB0P^!~Y|n8O-j@)SSi2>BRcR8)tf!sq@MeX5 zU0<>}-$En2B7Ym1pn`=Fd;vPS9!3W{0a|_`EQYPvwWmp-vY1nYTjq}G z7mFatoHh2L%ILZnl+0Th*@t z=0~X3aV0nX0)&IYE;PE~b%3m>W9IizFwj|<2u3qfW{w+b8j-V9cv($D;r*?Sz-?hU z3@f+A=Io80ljAM)*%e8x?dS<-%hpVH)m01W7oM{@4Ii=T-3IXSxUG=YDmA)wHPMhA z2u;(TiNYsuY+$9>RNA-$mYsQhi;r|xa!VXu>Ov}8iTK&$S^FD}wado*LC8T`h390% zEtb}{HqcqPzh*-4-OYe(#O__&OEl!$Ao7Zs&J+uTBG@z@i>SSQ_4S)|cdrisvN3LU zKET*#QzKs?`Q0I7!^kWFuLy5!t@NpXjtkO8(b+Vbia&H*cRvPGwU%4gvPIPNo#iHy zNwkjB;-~5Q({NcM&!V$(ta;(ox3eag=$%0Ct_gTC#KDX^Y64f`;`m+YMpygis;m8$ zVgb)dkOS`NgDrb&ep~6%I>0`_{CKF_r*&mNFj>qzcp^@7++E)raI-gw z(_#nEO-F8A;2+6XSfPGP*|hR|6oBiS8BHGZm~}L*hc{phM^6CBi6;D6blihTjRKuAv%=!J_=P+JYJ}i7Ap{Yr^I4ewW&ei~e&0 zmUW>GnNnUrt7v!W=r1CCRLEa83C}92GU2FpBAD`WeRrD)hZd1Zw3~1Vd{n&kDyZI= z6c>hgk;s&Q@2~kSn{nm1VbiuoecyE|X@fWg!liL%;~o#6V0r}vPnO@nD&aAOn4eFH z|7|p~uK~_{=@5gaymtUuM|Lkg_}xlR&4fwxJ#@?hG&)6oAEnZ(gJU1mloUZJKSU?z z%XTgmg(d-OI%zt_d|Y3Os~G7Nr|G#PSpF&6=i<&1_4%y+st5bz=jbLox`Fb10Z2hP z^*|gGrA}0&YcLd88U=(1uQyDz@McYyfUki>*jXN8N}oWg*I>k#`qp#SgO4tCE`I+G zX2yzzy1zCGOUL~l+I;=N^fNi*_w(#Eow$oeLa%l0@A1|*79W+ zUFW_Ynl%(gARJE(DL2%2BPvJOv6n%#*N$=#?Q(=Kb=gq}fw~ciUn^UBXczfBxrOkx zW-`M>d+g4wMymYwnI;rwTmAW#5dU?^iKXL4STbG|U_8AQji<(^aICOk-vr37cb8k86=-vXbWeMb zvbhbUK$J?v?TAJ+O5n(tZTe1Qac;28Km)M+wwczqhblca6Z9}TqQzK2~2Dw-&~!2|zRbhv7WJo)lBdBs{AXCx^@Dr$Jfl@WGGxrc4^JMlzU7 z$xAu?wzIy>G)g$}WH(4%O7Qj=6Yr>*xtP6w_gJ`A-<4@JyHHTx*;CVq2IB^om|6G2 zNRVl|91`7!Oo=}2>*vk=nMzJ=a|bCONR7-HfZkDm+7W&W_aX@g$FDf|d|&-h+-N#S zC0CDIz&wCVeh=&@VyHM(P??6D-*xPWd#C7(c&DIV>rbr7K*<5T5k%O~?EhaMH@DTLw- zZZWYv2On>wjtQHl92`jm+xtW__jAC!=4zRbS!J)9l~A4NLKunETC3$7mH52y>vf-UURa zqcPsUhlsYe@`^F`{Upl}r<5Ncv+sA5#qWptvudpGzllnLY`$e9Pu3p;vQ`e6~*qZRnKcsQkIGT0GFGT`~ zwL)ISF3lyVY}?gq+gH&e(AiB&zrGBaY-{#Ui<)(LzO>%S#oo|0u0Ut|(4@!4ca;9T 
z0O&hn?Su(%Au74c6ACve6%Yk2XXh7rCX5`daCI>vxw12tv`YZZz`nPrUIttm1yso^ zC$2`%_oc06ytoF^G*8$GkB6jP3(KxfOQ3S#*mWS^TTg3Xn^2wWGbbAM{8g$qK$@X_ zW_CtoAIl~sT?SUgII*vrE=Oh7tVC&sx-oOop4;Y*%!b>`3P=+;@-a%u-xXQ=z5nCD z_V1}1e#eQCa^`9^A{%F*oXTiRTIdFjp$3a}Xe@N`F*U<;He5>#t{wGN(XZ916Wz#i zMC&XQW*00u*4WWk!~^5k{^mfq*xu0|Renl2vsp`{J^C5tRf zL%Eb`7$Wn*SSoGxj0bViUG%0=WR`Es+|wMevhW!F)3blHZ2# zEt&Rg5@?t8tjDaFU{;WYL*{q90J@{SO+6*_!-%=|4Hs9i+ z+45=dtpk}MHak`oOVdYcswOl^$i<@-+-^gu@pO0$m|aa>jMPyAtbPKVZMi98%SZ1! z2*MEd1H_Z{wGsCiaws#OD#%mdY@uzVgOTo;nh%+`7p->Bf-sBPRjzw3)9`aJjkVfg zuF??>p-IJ9i173IJP`W)Vb7WW9pm+yz^I?m(NBT0?b#yqp#CtTyjjx? z?sQT<8u~3zGx)F-jg&x+Z^N{nvXS<8>N~g0N6vir^gFvx{_>@N3akBlnXIty4YJ<1@5!Geao%=ZL2H)3g5zKyqZPcelB+oYVWVrWzP81Ls#2=)e!0{u+?X9|_;+ zI7j&nh{UapYtg=~=~wc9)tKJ_vQV?dqVhdjp{9z6pYd<=+@{WRV~g$HGhu3I=!${q zEL2L}v}+tk$nn{b%+RrSDbA_14W55w=|JZqfvFOJc+}t zd8xw%0AJf^JuGZ_Y)xJWNqI0u#9lHMTm;J!bhLE0P*d_P!q*p@PReu%L?s3Hl>AbJ z3dI#JsLN0qz3iRf-Q~#4zguv)you;`<-4g!;+2SI+~tlC z9cK|l-fT=|Bv_o}st8K#Q%jIp?5<)3T8a)v&zAd>uSUt*RcD5uhb(XuasUx6Zm)^>{ zfyiDl8mq?@$gJh5NA(HAcq=69(O*u-twuM)#drw|+6Ksx#}Us_W}Y%Mt*gmK9DvbX z+5yRWcWnxW5+hAV#du{jhch~1S%Pq}jajU#W;K*|yZ6-BLbRN=B?BAlYvirOnYovmcb@^I4y~8?8EjcISVglz46g!9nZM!7%XL3sat0!cw_i#(4aHOjg$A|1=;JMX-TqTxA&X>DhZjTMAEc{4@m ztQSzp>0agodAxWUFz;!rlL?4((oB;t1G4f4F>kz5X}43#@>O)RhWkZX>y%@ruhmpM z*GS##XzfrOR$_FN(W!)j<0aOLTD%Fu#GbV@i<&`C*;RfU-HdZ_Sm>SlbC`h`JrCc^W4s=5xx0Sh);G8j@p&`tp3akq+DtJIV3sC(1U|tpc~a9Od4!G zKJq2dM=~TQJ4k(ppOCru>q?(`Jodo8L1)93TVZGxfGm|eX^SuFcQr{zFMl6i8Wrdq z=IhD?a(q4${d$+t<4geI8QEU8GtWXP_sG;LH;5?W+2|ZM+PJC~={YBpXwz5?(Js)b zUBt9#teV#@;Hq5gA0nwxopp)xf!PIHwde(zL{=1&p<)}NU(_UbGSOXx$~yKr{2Q#i z7?Mr%(>pcjB}fg?(y?Y0#R8Em>d2+bW%(;@klbI6$h^XyA3foU{Ml(N)9ivISP@LJ z3lU8(%Jtxt71%Cdiso4a$lL~Odxavi7?J}PMih;@=m<-|Ih2IcMqUNG6qfR_VmxH~ zYP3rAi!UBAldnPQBtPMDd2LPEQZ^y3LpMWPxLU;F>HV|}Xnvz5l&Z2AxuIewe=$1u z_}Dg6*s_|Nohe0Lj?PLrk5HBxH$obVatvd6rpBsr1sp>d{lcWBDO&+qkR8KT3Qf?9 z)lhwH?q7VY&9kOy*Dskn5NhZqrgDnD9hn~vubiT2WCFs#50lLUnKf!>=&|t7LR~N! 
zQr{%#L1YVwPZ#I9HUcusSi%QaZ-r#9b5zFx|DyG5s>vpL8RtpxW=INoIkOs*%x$2o zTyc61$042m_KK=-`pZE3YKC2Icq!UO&2Xc5qJwck^bjZ;d`CGRH;mR1dI*XvwxTeH zv7{7n6q!{MPBvmXj{&lTVZflg(l|iLpYAwDRwQi_Mr^= zD!LPvEYf?+eyW7Kfmzs1Wze_-o#j!ysNh`y<=yRW(=b8oslSVU7vT^00@RMG8Jf_( z`g4cf4(+%fkS#U8fV~@%U+iXrq=3{{ZW5i?C+-Evaqib{dZc;O42(C$ndbh}$WH2( zN<07|i6?)dH$UiEdBe~fs?I}5awNPz-ad>-4wS`Pf3cPw05;41Sn`|B`Boj(+X((6 zev`3#N?AcNC)N(@u*Whd>BD;uPZY93>B4`hmlXb;i0F94FATua6y z@Tojs1UB-k%M}?v2~vVpU4PBI;W`W<_%e z`7TVMH|OtjGe=b)lAqm=w}^A{ZK zyvgnhO0iqG_r?JGe%p^O~mJg+FdIKtjJd8T}cI>Gw1Ex$+r7eBZBC$MQ zjBTlI1`HI&>KdADMH1uXGL6>LZ^s^r9vlm`)ljS~H>rlB2V2?df zKG;NKX!lF%Uu!qSwJ)wzh-T}^+@`k8{UJ`$btWmdmF;Wk6v*Vl8wX-;=s^>emFp=K z`PkD+lknJFx)z&>CH7fS2{!p5+m&ZoS?6ztsH|v*RtC2rlBaH9NsHR;pzJPFI|>K; z0BWm1yF(8eKxTm=h&aswEkEOo$`5B+U8mUgw<7cFYs0WdXhtj2y(4dx8G|%MFdLUI zPJr?g@BGXEyN{`&(u0FDX5VRaHgt`1M|%L~x6CZXV{<1Y+a>&zcB8Ty6X6bi2U4+n z-A6=UKoC1>H)SDz2tyP_0g)xSDOCk@ex${6BZi~>HAVkaY_;5t&gxj|3n8rE+yk0- z#S$~kDX1n8^}T*jkC?WJ<rJlp0j?f z2BL>hIXG={)4d*}a1UqdmbkL$K&D>V-qN*-l8_W7D?BV7MP)VH%S!jLN=N*0N&-5o z-5OVu$@w58X9Jz;P>C11H?}~Zgez<;Ij9SW6s0)*;XBXNd^#K5_SsC&Rop)Ab4dtd zE$yM^7dBPfB1t$OpfraOc)++zIq;k%ccrZEeiWT0?e29n+%R%1UloAd&&LsjfVG`_ z1ML8*GyU{G(F^K}DpaC5zX-uzCx~M|@+2xhaN`O|T_E-zN>sje-V1@tSd#L;nlXWcK-v_Aq-!7&Gc=XNUK5Jt55Q&3p9DB2b zh(Y zYutON$X^1YONu3F&PNpP7e;-JYDRI^I2jgA_Zw*TwV2pvL*l;$C70QJY`Mim^&M<} zUWk2NElLRh;#KjsJ>$fDt>V@t|N5`5>`kqxHf@1_T&PHZQC>Z66;E;fGpo%_e z`^m(QP%+LXKMPqh_a~s@?IXpCc7DyIm{enaU_iM5>Z=p69=p(U%JD>8aH$p-K^jR{ zGui-Jxnw#TY?+Di9igmTT5||*jNtEO5cw~>^Do&NJY#P$wo?XBnfJ~!fX^Sw2Bjrt-~Ho(9jB{BH3IH7W!tJ$Mm0#eerw=VRArJ02ZJR0n3 zRFxU@HRx=>?Q@S$_Ll>2*H(=0@lYP2zYdm-8mAd1CrY4<>ubW8sPWeR8Mp%~3;BRVPfleOp2vw4ep(fx(PT*Ig^IH}XK1Vlz zgkY4*c-A43T|!HRXQ3?(P`08IFLZ$GstXat-K?2#I5=BDmD;-5cyW1b^c8rH+#$~xv(Sna9gG&R#sTDx*eUPnOQYl z6Vs&Gms$MiE((E1Vzo4)vk!UBR$0wO!x@HgqB#uACM~<`rQTa%;w;Q0I;Yhh1)I_r4V&BWXu7`Z>cBjxJzF8`%e2$) z1m!pBX|^?J2#{=Ux2a*p9cUFh`S)g#yF6zd-uOlEJeJOTDps}#*zqY0F!i$5jUw+u 
zH3C#z7xts&xMo!cMZt4O#@s<-a8FI>q%wxwi%zj<3EkzsOuJQJZTbMB=^++_B-4i; zfMsRe+*vyNgEbXXX}QajiU7;D4CAWfRcwiJf4C+UGr)>#w+s)!ni!4GJDD0$seYCw z(ntKF+$LAa9z`f;ai_(R$B+bTC(ZJSOvadIVRjIaQr0KPsro(%$nr)cu8^~*Y7%CT z&=jA^3?hmf;{rm)vn)r?B~HO4T89$9rIm`1xUFI3I*h2|6Cu=T3Lv7%XC&yx2z(US zG=67l6w4qZ`-}JZ97M*5&8Pisb?% zjxD?llkLW)f4+jm(y47He@=ZsV2uIP4W?yCeGQV8>6vl$XUP8RAcE8z7nq##oOQ69 zMkRU^orN%D&;8nRk>UU=Ux-oaZA9kBI%Ul+k9R=w+Z}_)yPh>xNV)0D2>Ko%nJ^j` zdB2}z-*~Rm92%3A-;EG#F+i0{u)86BE-_C z`Le*$n({M5etlDDaGxVnl$=V7b?z4cJ#Ox4o4<4j6lQ4NbVK_pe;p9rhuD3M%7S!H z^QnM*1JQVGhU#xU`cTWtp0#mh?RUszD?cb~`5v7gcJ*E8q-Xr`2Ni#a@?I{fKNG4r z&EDTxsQC}3Sf35a>g*1c=^SKZRfz?r7-M*DL_Zhq>wFqg=RFUNVY@=VI6r?Qg)7RX zIv1duu_;b1{A>7BTnNh#n|7z;^!G)O{P2oc%wJp+gxMx^&r2XF78_X<$~xdukjlhl zAcxCP$&uOdFw0zyZswvW`ybPh^osgUT+Fv>2@n;T4BkQq#v`F`N;XLbBM_OB7a(dNnAUYKSj^l3oKKjrK5QaV;XNF?S^T z?{%5F%U~K#g*GbyIE9tPk@kj6AKoR!gL4@=i)=6}2Y{Dn4y%~vdTcK;DC8^nwq_)T z^A+f9p+?`+!mTx%iJ3u$ecA&ghv(+{y^)Ct0^SDJ^jkyATjx1z7He~rY0n?ak*p5S z(Gz}c^N}3^r54c-%HBl2smD926?-^6`FluF&n;Ajd|80`Fo@oYY6^Uy3yNv*pP3P$r)Z z|EOWlnOS>Wct<+6p3Wf7O1VOd_5e$!bmirDV}KNi78e+ED5E^o_sq-|Uop^4*7x{D z#i%+BASQP6@n;r+n_*ieQae$`i49o3+oRlLHZ))rr>{FeS%OaC?$q-x!2FV5U!v?a z6V4t#$Tb2#$!c%D7uOJ`EodJgD?IC1cNB6zBsHpVjxy^6cy|RGAEyBC0cNe&vm>)^ zQzJmx=U2!2`aWb9lVfw(((eaoyzX%Lj%I%#X)<9;cP|S=z8~hcIBoe5vQY~DXP)Q{ z%W*9GAI^kv1TR|YK$7x#>7UqZ9zo`Jck_c%??(YC2HhOR=YAEP_AywFgyWN2dR<=7 zaO{c9%gM{KUO0$M4hbkqBc)56MvB>wE>RI0aM2&0f%u@K@nd4Hr%Uy-utuI^4@KCc zH~uU2B2y}Q%;4#le+ZTxbI^L4_f^g<&x2F+sS_mTFk0IR*BV>XM^IUx-v02(3Z*rg z>1d|%fn5aU7%E$+grqx;%r;CN?w(GsKZnaSR-#nciFrMNUjLiU1H*&gk<3a zH^APP^A+;yuKMUTuK+S%wxa%R#W4SB%_g=e(42aPl++c zHQvxEpIiz+TB7gSjk-j zD_v_mq8eO_^6CG$;3_to>ktaoIyltJibZ0s2UOR2!AX^;DD5#K-{3cSn_A|co}YY{ zWd^=KMZ?ql%K=##?u+FroEst9+om?H#dGq!y*Poi*3_&f)4{6MkZ7c1ISsZUmG+JM zf-URvx0&r)+o#%*^V|=<>2e&`SUlh!QqyTJplkrk+qinNs{$@82$$}j`cgOB zkBuezONB5pXR}*DF&+<&g&AuTD(mVlEE{H27fAM>5}M|=N=`-~#oLitdj-=I`p`v*pQu}m8Uk->&ipOhVe~AT*6w^)8D?W4cSk1O zxPiXnF)QoL9**xp`{dT*Yq&RGD`paYko%H!JmjUlDz+bq4H5sP@xH$sk~N!|oQg<` 
zF$&+4$$Xn0!%|K;fn?Ds?$Lzz)g;?Tc9x<0{)&{PAsg}okSyp>IqUZzT0w{1fJ#3> z;LX%{^6g7Gf-vQ6r)l|b+ANHF^GWp85$jj#+MP_?aBg>NSF^CeLcD8?_Ch2zk zG_;n3nMAc#Q>*3jBp{o;JEGe@g;a*&xq~bko|7-*5k@(UFV6zALUYf>Qsp_cf|kHM zu|*gwvqLa>mfQU_@$<-J+(2)MEqEB955))~(_+6l0`lFz`SAi8hv)pnigE$svHJea zc}?8$d|xG2h5{XHpc56^Dr!r28&$(|w)Cn0mL`sOGjVVrY(?)Oajf3D@;+Kw zh6TLy`493}iF=9`{UKW6*)HPfO;N5s0x3JM(jvIH+7 z&Oe1G8$(ZM9NIqv#t7h62-*fBOW!xRt=P`K$e;U5klHU1K23Y&>K&V#UjY=}hSxT4 ze_h|^`BHAz{00)D68Btx>p7*jt5=8m4w+2lNE&<9-vg5K#cE*5N6+{_=HKa?Gd!G$ zOlhN_YudBW*|aS}s+WMcXM-|Z{vyQ+QUS-h<22y802LlFdD%Y6Un7FFu{?APKr`b$ z7!H5hFi!Iu;V$rtx?*#*^My5?6poga|4VaRlv&2kLHfF048V4S!@NJ0DeMv;&TNij z+?RSzML5k|zDG;B44Q4R%Eiq^>wskSMR>sPD=OVG7OpM}&{_9jzvfv8!1C}ki)r&p zq?+gPf<=fFhRM07!y_fwu{d+Eqh7-Pv55e(5MclhzpJH?ECfrHZ(NPeX5A8t;cJl1 zT>OBq#);%&>9`giT(`YQ+o)o*{hKfxPT+8)!s6m?nSy;Pr6osjjb35A9RtX6>5th>JZ8~`cz@b8r-6Xw1y}lmgyl$m7OA6 zD=3wTFoJCnONi7y2U7karkg_^8Hyk27iw7Sil%-62^ zi@#Vz?Lg!=Dcc6TSMtaIxOC}Kx&|Wqz%v*nah`j?{x>4LgaxNkV44xz+ zIl0sfZ_mplEn;3$MG%T=n_W5RAISUw7shh9=fX_gTaG+jl&NE`j|IrZNd2s%ovM+) z@Z7cSN83Ou*2J)*5)rxTGIW+CV!4EkKDy22P?Ai`JXEI5zYBB56`50;P&YAJFFu*CwudZpf_r@J7*Cg#&S59Wvq9el+rSCcfdFz}Uk8_VIeLcXaz}JMf z;W>+}w!e#dF3aSKyh3bMJ!m<=SYcrm9r{L87N?`RT7uRA^SRdXm}goMnWG?tGdsMp zI)9Es19n}KFl$P?E(t!z$?^a?yV87wg4i9RNx*Sxi(pig2!yh}QKGj-huxZh`fcNCB%4t{ zeE)cfh&_hNp{6@D#|dOIxfSd8q=*OgrGkL@D^%DxYj!`#&N@onDP zUx6O9@84a4+1}#5bq^pL#wy5NY{iZDUZ7&^>Wyv9`!a=br`%h=9Vwc?l@Lh}Gh zo#&pO7%wM|9?V}Atz#t>0+l@MSZ#QC7%6izFabP}Z%P>sKQst*iaKM<)BQ|-bcjq7 zn_Wc%c&xsoC;I&fbkQEzC*_a1#Vn_SC=~hQ$oAY3`Yy@110akI-y@UjF{H+l@{GB8uFp<} zrSwd++=lcej5`aJTq$?a3H51dA$Ut z*Y!u1mwF~4D~qqyWr&or?K5%w=5llvI}B{5lq&#PWeR@=v0IR7%GK;eo)*?Lu_;|{ zi@vhvGsQttFIWWdA&XHKU!_=##@Of#deD-ZynV35aajsT+3pVmNNzq}4OCIv2gQne zO=iL78@sO8=8xoJM>O1Z$gGHo>ZO08Q)KF$afbx00+sb%&zwuA$hWD#pO>}2;y82cMFYF?1y%}-?5Qto4=!v(YWJJN^m?o3 zT6r( zghlF)xv#H$YqTsq#G@@r)pM znqfY#)5$ME;Td|6hrVJUbB8ErQ{Mt&cz z?`t5^4#ylpHr+0E6^keyg=J+~nPMj-gzXs6=MA$4fSCA?0~*0% zlb*3)Isr-cuPEV!UO;EwjlQJNFT%7avt7v8Nt6a!bkzdtMgC$f5z6!{sI2XzYbl86 
zs{q~7!pEYjEM`l@#@E2UId^KEU=>Z;orD{3YVqR}ak zWSq|#Mtp(DGFiY02CNWYg7PD2F8r#}4w>pcU)NVw_PXsc^!0B*zIO8O)%aV_DF8g4 zFqe&@BE>(PkbDpEi8^GSadN&Jw00P4&#a`NqjUNSmm?b;bC44l)bNI;;ns zn`u`Re~a@FIUW)I2u)lh^n7R*u)PVxcL9X-^^VQ`i)OtLl`=LoGqw$vFRCwu=WJ-6 z7ejRMzr2!h!XqJY1;Jpf4K7VmgRP1?!Y-??n$5T)Q6C^!)e#4@uBeI2g`f+Na$Rps zTj)`h-?;mYdl?`ynVo}jJa!SXsa2S}oc4(bJd2?iL2pdyU4qV9QYeb&=Tb=4t9xWL z28OEvJ~B|QBDe;Tx!NfxcDU%?*FsY~%Ej7oukM(Tn`oSYfWHDhbODZqcZz9HUR4ay!yas)gm zBTyT#`DW~1)ArUuRFqjTB4GTURU(1XC{CA98UQIZ{f?W_8qo9AjF5;5)&(LHapKp& z-GlPw;-wy;1b!RaQ5)-9;oDcnj9WqK);ks3s+$lhT2s_% z&T&NGepqIF>YuGC%%Bfcq&+URBts8Yq)q&uxK-t$nv-#JF#UHPhNa~7lwkM=(Ag<> z_wEQw-6H@c4kI#41tLq_Wf~sgmM_I)36@~1aSWmu(4WZHPW~eHu@54AWTd#9;>(k0 zRdNn3A}^(R@u^H>z8x!me9xds=i0a<`B~2-V|1_tTYe7Jj2l)4vO5F_xsFkSNIZ{B z$)S;MH!<1ssIM@Ohmqt6AQ^Z|9J7+-C?pxNWiVW>kJS{D~ zg7X?GD@6*7kf7H=P16;3>r?35aWS`)i1|eeDITV8*0igO{q-$$@-UQAnY|6j;dbp0 zJX7AO$zqYnPX8_-ixE9ZEEpOED2uVG+=28yI$68kx^3>)lnO|*M|LE-l4-9v^L+@% z&@eU7ED%{DhgHI&{4qejy>tIE_uJ@9pP(~enu-HA_CP9PE#p#OngyhCDJ;{p3lP3< zwV3(BW0us`GY)^5sms6>+csvcuTCeKi2>?sG`6(xXb)GKW>xzQG>4d*V(|PHNmT!4 z;S#!qXVr+KcWKc19uWO!AWD43OQlVV2O+frm96eDGWX>e1kbA2sROa&5?7O+4a=H~ zlu_2g=hRf1pZF8R4B=HJp)^^RvblZ&CaRH<$%its-T!>D-ZE-Svr?V(@5?Bt%+*o<7Hrq^# zz?x=DsAh{jXVt^5S$vz8fC$Grzb^HdX47c5W<4DnH&~7PbmkZ_ zbjr!`>k_+jqp=P6l{*F$9PH`&X;Fi~)d9-Va8_~&-;gQs(RzQ+* zJM6#gnETDZZs}T~zO$Y$c(%@JK-2HSQN}=a?Y5gBlyp) z(5zNDRgTk}AS#eogvYk(WWX72i8Q%s=X(iH(qeaiVy@DL6LbZ#A>jNsHI{Df_ zrf{`wfEjf>(Yljruh9Xmu+$g8bKpNqW z+t{5ah^5D5%}KEgSkq`_bRJgKW)aEeJ_r9xr``!N2@KIqHOy|5@0NW&t3>n89gwWS znxdZFg-(vKU^bfsU9?kL~B+I{D15;7Zng6=+ zvY+)ZAbAp3K9v>M0Z@Ljm4P9zgM6gEC%D3pN<0ei*_oGYaXjWZKhi!En?_Hdvlvq| z95Sq=4?>Isn}0+%@u>D~CHU=Ai2PL9{M!?&pJ!k>()5pSr_nSX^3Q_vBhw=hrsFwu zw*RSL#Wn=|yiPNV-yzD~UHaCFG$p^cARoYC1=++9l4gu{MOu=RgMW$}nL&81W3 zs-pn1p_@*t!ZB1+sZnKDg5!`Ze8kb+QI@_ZYAQXs*dkwmsILK&qV*yoxozVkxBE_l zvcF9a&pkc&M2RHva;A%|GUfe;H-1iq0i_E>PFuxh`HPwL)tYrHpS0r5^%_VU zm6Noud(Of}^im;m3HK?OqOI-d?qLu?q-2aa(y1%G1;Fqc=J;(icEtH7zrK_2t|`Yi 
z-bEyDTrpF8tKQ3(@ve8v7B-0Y(JG!>`k?{QT0ru9Y46-`cCdt?vJ-X3H2M)z(YfYR zR6J+VBU~v@Z&LpWOwr%?ud(L*G!t*=GLL@dIZ7EJb|@#PESMlz1}-`Zn~5$V1{5sYqS|Jq};;B44GzsXdN`^NC_Eh?GTl2>+pm%ysAfr;@lxqP4L zttw-G^^BJb8^Saftr+cgCMb)@gyHs2Dhec7(H4HEXV(OA?O|xH=RlNgM%r^d;?u~8 za-Qcod&o<_47KolG*M_al&)^@C6@IYq8L?{T; z#U+3&W^BX6vGPkHb)o?E2SYBa$%Y+v2k;X^Nv)k@J{|-a zo?T_ldvy{Mc0TDJnO0rwzLkmZT1e*D7J5%v-ChUF_L=F|KG$asoc82>L*@{2KkC*4 zc`kz}-|DruvV?eUawEs-=zn8kJ}CDRd6#7b<_iWY6{UD28JYH((^eyKs+D}VdCWmD zCe*M~g!yA#Civ+MR1J^Wj#R=}WOV=(VRx+fV<8x#(FqFy9~dzAb|I2qoxM)Xt3eNh zT$MwIvCQ3w&W1Ou&j;?h6-d~G@s5)in^1D3KFD!1A_w$HuVW@_Zv!VAJHqevc4SJg z0|@9obe4PzQwymW0QiDUp*WE`l*#Q0i2$cw!)S$yl^IrqCyCi$D=@{(8BcpjvK<9w zVFa}*8f0-m7|ZOi*wJg6=L9-isxbjkBPU_WnJ(d!Vi26J$d)Y;Er*Ix-|OL08297Z zRe@oKKzfMXh^!XtNfhXg=GPL8%OZ5=YUkd3sYF;08`OPh0=CPTa(_+hK*esr0~IJ+ z=QIpJ_R9|D2FL4NG8Truhu~TG>=_TEL#Fp~RM9jClEmHLZ<}r9S=*0Lye+j|%1Sq93?s_)M#u@X26}*L8?>3ecbIOyzED<%G+VT`S`4|0~ zvZ3nFz>+teaafvZ1kjWu2JGLPaQ_e`d9LE&$m33tCV<@`cxk~hd_0eu7o%Juu$5$r zkPT|Z5mbIqe3r0x9|bgBI;_1R$v>xkKx34PdK|B>@VMzy+Y{Q!7Ng=um22u1K3bun^E!AoJZ_fLrmF$k3}m#pX{qe`_ls zYig@#XZoqU4hzAbh*jFDB<3D}>6es^B)G#HK;J@OnL_;8RSYHhQ)f$eH!TCv3<{II zHgDgp^pNh%Hj=Ln50;}8@8_%GpiaF&C?N@-8eAL8ybm+&>X!EIRTKb4ccT^Mr5}F>ieVQoG6C zT|s?$`W2gXVN&@D(g>KYLByU<ZMmlQHg`>3y5Fu1#$1_Ft0^>-dz}jW|%42 zNFZ8#ZA-hYnoPR8T*+{K{&3X-o$3b9+3j0K$=os|Mv2Z44xP(U$yl9byD|PoK$f9p zw6B=yR@5XbOz`HO)(Z5G&OI%Y)d_G|wFex{)duhll?_|!I#g!KE3&K*+99gt5>SSL zj!Znvt`ho4CXSHAySA5OTV0SW|J1Azum_p#vC@$+_cQ~4Re7&KMi>GW%;?LI%lcI`lA6HkD-PVnK(QZ~CFa?e_O-h~$@8uv3-?rdtT%k0?nTEa z78Z>A>g(21LKOS?eo*u>rigG*djQ#R&DmZn=5ng-L1-41#yB5j^dX>(Z;s*PVb59R z*&H7Z0F>%4%AY@{lc1BAYSSA_)<+>(75$q(2XT1}lq1HB6%b>EQYzCEU@{gKA_9F7 zm7ij{4IAu}fTqZ?e2<+y3g%O%(F`h6HZz}rrhKg!oSi%3XnMLhJXR+wEWmfsjc34_^-_k-O+e5LrO?-^Ap16p$Y^EX5Jw-^K}ldI;Qz_Ma^+ zj(b$a?w*#e9y$oBsZp3%!ac;wb{)+U~J;n}08zcEuQzX6UOYg2={s(~6&$?~%jcr*mGm@u`KJ){oj)X;%l7dRstamE_O9)y6b~*_`KS3z! 
z<~tq8{M2)Pz@jZ`_8A}<(&iS;*!cJyLOSq`21_l;x7)+~{H5po3?;#$;VZP<$^>3U zpszDAKUOM;XC;jNSS|ByCXKPwK8e2h9iWkeaAjJdrv49GD`x#V?xhT$IwU3R_XxuRITc zQC$($|9sEncew}>7vx*QxP>NpAu2x>!wj7<4D}ZQlZ!Wx<*Mf5ibPc3;J_u2WZtPG zVT`&It&PG+PUS%O^k{gYUG6zWpN~O0^IrkV{0BychT?qk0!TKb)4(y_FRV#!2}g4u zyb_YU?2Yq5i;yZG=L)fZF)BZj2mjS<2}l`6wwY&_=Fe$YBnZ0B)u60+7%qd89A3jluKw# z8i!}O%^jYODQg8v4QXiQ3_>eXVK_`_-{8P%02Xt&)ZA?dH7@3!qH$6-Kv^31srq(1 zS`|%NshyOKz>a)R1;z$7_|%2=oj?1VtFEGR)btd`+1|KncOytSS8nK7 zy><oeUW)7afDTXLI`*}iMoG4;LM>Pux`CH&>-AVBP!d)hfF6N;ks zC8D%9uZ`1*^bklhl7>SCs z1xoA}3P}axmIsWw3qV}@cM;<~o|Dtm!@SVczDp)pyP=yR;<*{9V{K3z!D7_g4^Ixc zDn|dj8(me0*f&2Bv(`OO**6aMdqcG7BtVUAUXzMif+QOhqfkf?QMNc+_&}!9Qth#o z^|m=w zF}XhhNxthAJVC-zcMzsr3uBS>q~|Db8AF~z$Xp?3$F0DhK{jj|+=AE7qO&l4Tw65H zb2Yu2pQskd{JNcwvXAyWB)a5isBec6K06S%yB+atz-_siPT^7Y**!y=j-j#$Ey2R$ z$SgwM7<2-i!j{p4)%gWbe$^0NLZ`h5$vVegrM*>bJqc6xuVvTk-ipMu}bpDv@Dl?g&f&Ysf57TbZF=M<-LJMs~(t+9^N^neFwmyKZ1nf;Zu+ zFK8Wo!gJQLEv%MrBlEjQRXkYsT;B)bp$o!7 zjr;(W9O>(wDuJ;-gf!+&k$}dE`lCo6)r8xfhzc$1V@TF!!{|ty8TkazXngbb^9O+a>*9}BGunFds^=)Se zN`SLPnF0^r5Ime_QBy?m;-bAJiL7iQ1>n?DNOLCr{$g)B8rH0|#b50=13{d(yT+rk z%>BmnrfAn9n_fITV+T64=j&i8`UbN+$A5ih(7lFw;Zf= z*#kvyM`-u$y}SApqXR`GZ|+$|m#A;qM7OJw3CpG!T>_PD-76+RtSmP|6n4OoFN#F{ zdD50yoKW8c$ug8ld^0kIwzXFn8CpaITq_xq_w5O65D(1e3EiL%-~+w8PyUSVfXM7R zVo5cGR13pEh~03$s^S0I9I3`z^Hob5yc|stIvqj=k z;cYsJN={}*Gm|0j(>2B1ar(=cGk(k$nTufeBM~ZbCo=i8p$W3L8^3%Mt50Dv_+qi>O`$5?=$NOS*xf|^RF2_&$V@cYZ&4UYEEZoWdhj4AIow1N6|XkO^l9@)puPuG#su*D_KI`*;5hE+a}d=*bL`UB9X z8C>Fe$P=h!+R$X)*7gD<0d;o6o|r5l!lsK+}*KQgS-rppJJ`D@9;`eoQz|e z_*GQO`o^$@zJ{bym*sLCONsaj#bx7lKV(f@ROh`@Xl3Xu29q}tq>3qmmf$&yYt>i0 zM&5=fp*YdRG>lyD)FfdljDGrVCUIanJZ#nKJ%D1F_RI9~J~F>PRW`Rj$n-ehEb=}C zq%8Cpgwh}Cqs$|G7sFfPW28bl4nslkoQ0e=#)OgRQ%KWNz2OD?89Mut-AHi}G{oni zWYyXz^B2gJfbo8Qkf|?yS<`HbZ+?Z$+0)9v8H%_rho(>ynKtMONS_vdX}ARb`4-)D zPu~{GzX9Q zZEn@!0eMlA)J8<;85bjcX3G|bl&K7;Y!NCzF<)H@QN7|{bD8HXcHXG2*q4I{puGu& zyaJI;vmqLD0Wzf_)@;U~-_jCbO>sub-B4EoGS9p^LN<$P8vQ1Q{>6Y?Vg4|_tubVY 
zALgap<#@ii6r$!oUCIpMQPDWSsMs}#{O(`nl|9#1fZiN-lIs$PJ@)ZkgR;6FkSrGw z!xVP|T1gv!#9|~|)j=LX?l)~H{Nj!#Dc@WLdhM z*C^Jv+v;1KqC-G#2c*2ThhJkKQe{JFCRqddqh0#4S{*`?!>JhDhCOFNhr>{jQNFfj zf`Mr5QO`{ygwIZ6#2Krvg+b1E6lV1apfZSdW{{&$AhTw-;DItu0~(%+uI)uCMdm~E zC!T^N6z|2Iz+|TjbIKxYHw5d$&z(wv$b7rwE?Rm8Kz#`8Nl*3I?R2*Yy($c z{D(8{M>f@swLmN;9{_2V-fg1dvOb`)c;R%NaYP@2MNKVw86gnKd!MMzJO`jY!*#EW zGmq3nryfs}cPt1Vg(eSsiz-IlfcW;we;NzDXqg}7Gi3fSg)k-IK&6H z3=YMyQ=d2i$Q%U^qaVJ2&Qj{4BhzySedR@1a9`XIwhDw2HjzhP&Y#R)>ikL)Gt=RD z{wgA?Ha1fxl-Cj%5z#;60K5*!Pp>Ew!6|g3rl*6Wi)&)QdlQ~Ta=DdvLuB59Dbfb* z&q9I7Y7UN!Irc&GfMgRnK`I6(4TbWN?@Ue465sa_szF!Uc8kppRF3})4dJG*>y`!Z z2Y$$`!Gqyz_8}ytkL{*xw}`jH-&V4i_9{Ioz?wKp_g5l#GQ98 zH1}SFAmulZ@{2txKpAh+ECI+8c016C((A&)PwNjluvLV~UU zq*1CewEX{}NOq z>|t#9dt;OxRq6%2=@j-n=@{tv;AD(P*C4V`#A8h4Ab{vGW=&GLU2jzG0O>{%_}ANYX4>elRLaIqWAn znnwPOc>_ z{C|J^gTJ;jkl8YFYeaA#OuaMa?0y6m&&H?k&L8^9y#@Cm8U{pI7!0@ zxDS@Cyk?F--JhgLGi3q&K>f8dhU5p4$t5Rgxy`13-$OOg>e%Ie7@2}1a!4904}h{~ zxnjH8SPy0cb#5|HzN z!->{r#UJXa3WnRBy`z}oo&hFH+QOY&jh=-l!A{-qIgj}rky8A02%X~6HNJiB393VV zdy7rG(9I7+nh}Q^y7feJ;_^slCF0ZEzl2_W6rGH3v5DpsBH=lfX~^}qx#LE!bwphV!dKEF<2n7_;j}L0`dNvFzvNItg3Hi;_W4%z}tvy zeyw3W?frK^a(^Qn4-bcx@Ld3|uQu|$=P}!Q!nE=}O0G*6+nE2rb5@|u@nTv+zI6MY zR0EG@lJ?$R)=O*0Yz34t?c;p=#}VND6VEBvt71_96q%V>YvAr@Xr-j9m+hgb(dRWu zuUPK%0(91TFwWwYsPkXKl22i?mgp;h;`a>>#zx!MH497UpD|CMbHE}>wt?o!D7nuB?(xft~_XmdvEkFkg>@@D^- z)uQIOete1@;Rc&n4J%H7Z4GN{~t#MSXi)Y?5A!B+_FM>+`yN5ogt7U9gIk(bQk#$yUF$%7O7zjqDVgb@#^H?dOcv(Jm)HgOV z2F6Oe6OsjWp|!7cReH!_H@hB{9xF#SHYV+mNt)AdMQ5qH`-jU$&8CX1cMv(8o;Ft` z1TePnZ-Zd-Ky1g{?pc{<%9!7W$l`Pr1I$3BR}o$MMZU=HN1>>{-82$AFk8`C6c=zh zVZumJQ*;c5Vm^jW@iMGS0-6A&xHz5>(luFgVXEf#ORs>W=x%YW_~3A?(Pt~RG8kzK zh=Kvj+}HEQECHK#gNz-w7&`CpsG8&ESf&X?es#TLi5hNC1sukUcGwFb4g_F_0e)Zo z**Yn{wI7h*T2W$7n0fC8+7%je;G5Wc{hpak!cy0rKUasK()%Ud*53QE~*5qr$N5-iU{1 zZu6a^aIH-;yml;!-pMGz@g&A4f;fR_Mn$Jvq7f(#uq?E5xr&Br?Al zH_T1(=*^}2%W!;Ibk(2GA8ID2p2%_PRfrO;E4}zNMAOU>Ks2iPdVOORTS_?6or0K2 za-!m*Sc22M3G|VX*h+h==2FghhJEI3P>zO9y$p_(T|z1aG%K*GTUU4&8C}6e=2QxV 
z>Y8w4(>>2k0zp%QPJ_Uiz>m5$wq;AEEW94faBF>i-y!Of*iHOBz3c z)Cnl=oxYczk*^V3bLZTzC>r%O&FSPJJp+;z632tnMS2~7kuTZ|7#^S5V(R}AK(1IK zW54yQ{8_&1d)?bW)kq+0Txs$9CezalLfpSaHFL`e)2O{!8p%WS4LC+E-@NQL$_4@( ziWR)LOj9`ipTem)6jus+CK?-VaZh<)iB@?QpqZZ|RuAi)u3o~7oSjJ(k@sGd`5ZuU znE}XA2%X_vP&R~u8*tAalKIk*~<#R=2mTp0 z?22XOu@ISqZUp(#FRuhNBJq4~0aGquSr&(Elx=a+9S-7`32MpdOnA5I1WW4+bAK1B zmaEYz>n%6;&@b}mmiZ9x*P;o^DszqEUWdwtX8?)zY0eQ{U-2;A#!zD&U~ISno{c25 z2Ht9dWfdx;MR0pLDC@9Bo6t7US@B_h;+VDqpp4-h5Jslf`g_=u4ackVcTB|ub_=93 zbCHgrZ5=A)a9-W}Jxv3oH3c;z>m8Ze+A%b`PE@L+jq({_R|PvxUrp5mROr49pIp&eQu=|%&1pD)q!HCRww`bMa zIuSdX_tf_{gtIR-1EB0HIh+NDeeOO;vc&oi6taImEQ{z2Dm9}fDHnhqjRTYq!W!44 z!M9Oz-f- zW1!3!?EUscfA`?RiyS;?9T?f8aA%vrQ*gwwX{@;+%EQw*>RYMmnO((W_?bMK{Vjh; z1%cBmDt^WIH0AI)RB~6rv1?3t9+ULnuCN(RQ+WZ`e`9XZdut~KIlhP|s++n$I!T}0 z;fIYr7L8uULtSz)v&+qgPjWE_`Z}z*mwB8C&RiEc_ zEPD^Dh8!^{XYT{`wcevCQb)RXA7uC9*YkmNB)$=~=B z>lVl6alP=Tm(ebuZq)IuW9#9iLBqhNS!C_2TEPiD*DRxLAMlBNT4g;y36hn`#V~0n z!;wQdl#=Wzh=_mh+|0tSD>Ceug6hTB&wM+aQ*NeSE zV5jD)d~W~1n#3wwhR_~_FZA?zKRn~gT$?1;ZJ}?ttjz76EL~(EH;UJvHH!^!UNFcFD;nrV9C_r} z&P_n%F}%xXW~jek?=H)Wmv6?Up<1Y(FK2sVx)#|r$6vM4vUyBy=~LTY-n7?&-ee-9 zuC51?6@g;;R#&-h=(+WEid>uQj`ZAKLpX)Z+eUH8<-(szY8+Gr>Wkgp&l4~iY^~f; zIiOUC$A?L43LzT<%_zAm;MH3z&dCg51nJz|%06f6z{r74tFUuWtJk`QL3GcR^F{U;TyW26TDRT?nA;UZMu{)5F-G&VO7m+?*T5)~5yD!y!en=+FaD`=OtEGV;NoC<+ zTVH(uB9-yUIwDW~Js4a&&S$Eu59RL7%Lgiq;KNY&X|CPFoSJ;HeVQM^t5)Yf|C-eU zNImQ*y^0EwU$@OI{2c|PyS2sidjg1b#anFa?^F<6{|?hjv%r5_Ym!glCHwZh`lpo( zdI&<+%YCIqOL_*@nJ@Vf-NR*S$o7}9((9hX5dBtLr)uW@Jb1>j|BYU(q1E8|QuUlT z2T>*ej48w&Da9(9k5asd=<>=^6t~ZpaGKwtpHnVhc1;Fb*`aA2`Qhv{DME)_d-dYZ zs@K3S9Ys!9jMs5_+3E@YIF?cJyvWKHQCushy!t}>dcDUurbV@d z0jq@7^54a#rPmS=rG=7_XRcXK#c&x}DQL5?{}!WEf?CotT+6&^=}3J?KMc$XBNPl! 
z))<20>+vzHZCP)whVTiLQq`QB2tUp3O2Sh~!~G17D40WRBdsCsb6m1x6=AacbH2b3 z?96=G-@XLoNA!#KJ@^WYY}PO?(J|l%fYo%;BdvS>E)RL{`sFL>8a*7XkH9y9EI8b_ z=@}q#uDOFsCk@3@7~+EJKQ2nL7n3MGa5_rvhm+E8Dn1wx^*qi#_G1aeD4` zWQ44o5irzZrYr$ce_I!Rx${+LL7s_8otS6Q8=^&@m4}9RR~0V}5Whp)JO_|>a$Qw& zJr_)ZLJ~CB<7M5W&N-n)0No(kB55;DP?65Z)*u(psSBWPNU0^A{*=cH@p;qAT2H(P z%$s(V)~;k1V~Awa>cK(EME|pHMTup;z(h#fTAVr5WhkxZ^2=!yF4G_`7j#^6q;;*r zMexhTvC*qa@N1AIc&jJ$2|&*0_VNUxQb1(R>6k*{S|tw7i;_}XG&jIWYj%tsJqpss zJYFk%6x&UCgeWa4Np1-4G{^Gv_|I*?YjNT>`=#r43y-()?pjO)OLwBNh-PY(+tQ~v zI#%k>b#N5_7SD$dLR%iY9<3CD^kVJ&NF-5<~mt1x!OHeE$?F|Sampy*6;`FP6hx&U~43Eg-!*EweLNk-YBM80G=l71Xd>jOmw^C=?*?^DsNf;_U z@ffTIhv#ZO{6v1tmIgmQ2`AU)UJH+F6;JidRZdN>U5QU)NZEjH^~Cj}7F8R+nhsSe z(6fH1wp@G1C=x--F8=uDJLnNmn!j(krA=9_vc7;$*=!N7oW6)i%b>SA6vc;|tY7M% z)}Ac#GL%-kdKs-)>_d`zrBCX>=yb{L5Tf#2kq%ySon$TO7+9I#B>Q^qSXtS9!!@A} zw&?<00*p{-gp<|H-pb9xqrQZPb2Hl$?VC0MssaAzwJZ5IRVnZ0?$zs;(;-|Ztxetb zpZD{;;l^i+xgs}jt_8gh@{@>OZRzMkFb#yi`>%MoNc*GQV%%%HEI$S#TEe01j@XMw z9-p8U|3Jk&_|$cp;mWo3lFyQY_YY2Lm7n)NGu0aXzJMbww!&u^@BY5|XJAtd6Q$vC zW7=1kbVx4CwPackf88ghInly@2d4yNQ;y|I@Ozw=;{6MMO|O7dqduVIvAh0`Ap87r1EK|AP}7J*9rtS`;3?J+nPS6ydP{Vj7H@g07#qLQ2k z=Z#!QYc=`&-2CIE|B59dKeFEI-`jM;3lZH5r-ny{Yr+4b+&{SLN_s=CuO6apUXtsd z{A3x0!8JiF{~?s#Wte=CDbXu6box>3sVn@V#0yWAg0~7FofZ3eLr@r?$o`7Ws&OqS zGjHGZ_FpWTz^@3UB@T>L;APEVpr@=a3uLGi8<@^@ep5Td`-UD5 zwq>A^9y>BeYZ-;5Khm~=jRQ%4>$LS>!kNIxTI~^co`Z62?{7JLk2i96oJtCB|7Cfp z?*h^^C>J_UIY`amh}1l|ZJM0z?!URRyE}l>tZLo^B=1I0V%P_hl9R$q6+QsT$}0OX zVp6y<1ewyJ$|QJG?jIc6J?{f_GnB&EMmzSZO1K5Hm^m^3Iae__^miTpAG;+EOlrfI zq?*L-h%`tui;=c;en+3u>N3aZtd_aovuDNcj1&^EYVu;hUBR9rsjG|Nx)<{ z+|?cLtDyHHGnN`jt=5)iV5tL}Z0!^c)F5&{BBim3d73W{2|j=$YRy5|O%{j#mJ##Y zl0RJmm(r9avh5Ri7?lHE4*#7OW@}veg8aG$@kjg`Ir1`T>g7R{$JlQ z3oExs!3XZK+%r4vo+n%<>vLnxs`4a4qoZ-{raAyAM{WO$b@S;wefH1JraZVNL-UQw z{Qh6QFdoR9>ir_N!JGQG1*KSkh zH}kwKkC}wG-CuA0KwdqoT!QChkXj=e%ee{AN45=uJ-Bp80=KCa$(iKTYLDV zziVB&Lx0+*Gc3qUTl)J96+v?_(@sB!Q#I7j(lEb3D6KEH1il2i(iZ$GzX{0TWA%0a 
zBs*ehVZY0-S2Vl9?}5B$ujvXsp}(86XVVoBUEhjf>gmI#(ck@Wo0uX-#qSa6Bm?&O z`))8vyuH{CY6nm)RvwAbFzXY~WN{Rv1OoB9`DnoNx%NgWTWn5zeuGeNjcZmfB2AKT#tC!$i&QAhvKA7IHDh$f=qo!s9rWa3DP z2%e__wMKYq|NGp5gu!gGHc#{1~Qe;1E$+C)-nOC2KBN=-M+UAmTK_y>P!628pPIX!M+FeT* z|3!_N0G*fFQOO&uOdc2Z&(L(#}B zg76JpJxI|2m3nI7VU`Kk29O=v*nrEVJ+W|betc5e8&FAX`8M_4yrVYaBJ+v18);Mj z*yJuARyCdsp(3;a8*hD1H<3bIsDRpfc|YWxiNHY!REcfAMBoJ z(VDNrMLYu4K3#hH^_b+u$C3bVfb-hz^J3w$O(43D&BL0OEx%M3mTv$1*o^vBpE}o( zqvx_yK*@-4&WP>&Q?)&T)u6b08jQCCs*eLENl-?>ypCz3RHrA;0bAIN`zi?bKEKGrp0P6I9snX4(E{|08nkW< zCy@BoHYKgW=O$b=^jbwwkvD_MYq_*Z=N5!EaQ2V?J9>%h8eAy)ez^^bR9CdY=yo`A zzh;toMDFkCnVRg@fORKEYR#G9F4wY4Bb1H11MG;G&wGHRyrBgGy%$c(!|l`?ibsFP zcnd0jKO8wZdS)-hl-kkXGSYhagRs{*^qLSK0+OPbe?l5EPhisJ2CrU0!RT+`D8d_> zeF77u!ZNea=MKE&WJlS^gRQx!gvY?-WNhEWaNDc;1d65MC;z`cCJR5LDGk=pWz62{ zMNeTY@od1h7RBi~fbdp%vMv5S4=dQ3T4vW! zUkKFPR@7lwFM_JnKvZU?mmrysr&Bf@y^Kn+wWCBnZAOPzl8${y&F$n6m`v02eq-x9 z?eH~RGR@nj+^t_n$aEclOAP^1$wKaH8gIf~w|zv{_xxKJxrqm`)tW!2fS{7(=>BG( zd?!Q&Sj^D>E<%CqlP_nb_n>4eXK~e9%lmyA2d10&ev3$zG0&P*=#(Gy&t=L`y8eea zDc54MG5MqZ4&VIQnP%<%xaVr%pmqKk`vp$ob=;&3w%u%>qEd2ajvDPhgH!T73y-&M z|2c-Bj9(lLz5t@Hl}T1jRfG5v8A0&JsA1tNghV%ujVps@`E_n+7CcdG=_Cl*SMgi? z{Cgy=K-}OzdM7xrqka-axmS$SX&ce#H zIj|PE6i}bdkEHg9ofE9MLNU`q&h0U8dbwQ&rxDY~9MK|==b;p_J`(kbppqa`C9-5^ zQE5;YB%S*Hx0vu3{&3GfR%O(Qb{FBguK5z{9qY-(JyW@<$VZo8Qlzqlw>UFK@=LLb zG&@=P;blNKM%+q?Yu30GeL}PwVQT3mJ#+Z4WxzPF21D|jd_1MH7D~nK_GRL zYmwe*hl9qp1*n#hCrOSOu8^yO!9t1o2$rvrGNJYl%OrQ0_BH zL$g{wxN_Y(`UfI;yS(kB-VMu}qZR4^$!eK1ARIh zEy~h8U@zTM2ArE*OLKUHTlReO4Z0blJ1xDD!jT(S4#>={u8G;c@^k4MKytQXdapO! 
z-ecQ&qR~R+u8gQh_aiK-SzYEh3+DO|CM~%nH%|xYpV~RU z-2p2K@koEq@+=-6L_`V`!=r}XN8yxm&^lVl;V}%MagrdUCjxTp>61WSIOK(HeX1vR zPMa9fGY}C#J8-B*&u35>SqDl#eHKi0>khl>v%9stC7wekTa;eC=6Sfw*4PxKjpBhJ z=gIPDd(m~$9G>z4d?`1VwaFa6_F2A+A!E&Q4tk}(eb0!Y=1_hVthqkbujOI4ZZywo zE}a+xm~V5+H-ML3N$D_CQ-y zX79kI3==hTc`xuPC)Aa_;eC*(Yqyon{KWWMMA!Cy*mC>EoaEH#Hwv`@Cv8_&GK;HZoiX z4}1X=`6ZVwJ)5fkWqxf3oz~r7!Ln~aj;mI4f&3bo4Am-EOZ9iTPi@#Fp$WGrAiqbZ z&1~}d6vQSmCci-^2coKZ$G32r=iu(Kn(}{#P(uvTtRJ~vUEcOb{4cG1zN~HW|47i_ zan;on43`mG9r2kJ*rRb?qiyXNo6wq$fu*{=RxXZptpbJxa-~t^MtP?yYwE}6$Eg-l z@dSW)DC#w6oY>zwvcI(Ili)P>y$g-U6R%*a2UgJ~T<0C@Yr3!$jA+fo zYNmeU)mSu?4uOeBA*38fAsoEi67bUGC)JBF|RokX37og;% z`SD)pI$e47da6W^=4m_^Luzh4v*#r)Nhmv#F9nj_3do~sE<;4VYsWZ+5X*|5@u6>S zbE`1PZ5frFHA{0qbv78Q+fl)FuC?eiylvCvih2bsyA!2Y54g^oR$AJwg`N$Vh`ZF0 zTKi)hxQiyyr?RPyyPJ|qJBEA6b$WY!6T1Otb8eWf5%5~qO4xc+J6LWDBDp8X@(ki@ zfm*DiGTqlV|9V`qZa>#knnO?-dhVE>FJN&Jt(UGM02GCzhPS;m6ecJ~C7@6pP3JS_#Aw4TK3=G%1hr*b=4 z{fnj4ji7eY4NU_|;@nt|b1S>H*;$io#w(K}v%6xKJtRq>1Y z(J6?vL+PcUve&BqmxDO7mBe0wQWqu|b3*r2Q$2)?tTzrf1Ji4WG|TpRd>u@C<1oMu zB(HCvv^ROvz2EGAH{#Fv5VB^Vh`;pv`IEg$b z^lIKya`_~X#G0n@DVWM}ZfFbY@)<^M_wN)Dq2=~-gyd>R(7&WlK*>*4NmGHpM0KNL z+Z(>8MwhRUX{vOi_UZ~O*q4pb;Qd%s_L*FJ1713gp zeFsYaXJ^+O;(I`X)Zbkk*jxOpcNc3~9_s^eBqBAvvV!*=)kEJm_6jgXFx^{Rw>T#E zI5&=+gHi#e@yyw^rfnXFNHGNLYx}m2$H;R#ptJ5cAz0K4xqwfEGZrl0(IUO5kdtr< zvU;kj_T&J&!)5uDJY7oHj*44+Dx4y;&{C&?d3^=ysO8Pm`v*2P+uIpE-pI(u9ivZR z`y9WUGeLzmc}-L*ZlVS09NX=PjbB(5mOgX{VvgzP8s>*0964UcNZY|qw#%d&Apf8$nu z$kd5oB{V3J`Y4=t?^ew6;2!Uv7!~}S{s5=>i%(ujZtrhcK3dtI>TfWIu0FU6p^6uN zRrAdmp!*CyV5t9hWwDsWk;nFhUve_GDgq`C2khaiD&B*Mghr?5%_#Q;SH-rSEeDc7 zJ7Dr3=mvp046`_U1gPE@{!YWa*>zXGz1)+feM^7yrdpf370j!~7oJzT+w${F7q0Gh zuv&z-v0hLws>}U|Zg8@WSb?1h_klb$Q`^g`Sw09yzO}_?xGhLMgwra;;F}islck3( z0+l9sWjP~1(!-7IL{w0@=F6e|QP*nE60X{N3{d=T`_B_lC#&XW^CX;3B{D$S8|j4} zdJ64b&7Fy&@-#?LQ4(um>lsA1eWqLQ{~%Pgdjqd^3swhImBxy_SAGN$ z!B3V@2l{&jJ%6!n5c~v{0tofe%;2A5=etlLu! 
z@ZP!6RGL`6gpz6tqlEdYXKF@Vx%?VK6mxsL;&*vkA!;J^d&mo?_Z<57)-ZsGa9~We zGAZO$%bC9qpW5_6A%PWp7P2w+iqRbJEi}<4mzU^z+}fB!q(t=vB>3t=JwN3s=V}wGjgYK zjSA4;*<$@xLC-`f`qny~WrOJ_D+4B3DT5RxpvR;0HI|zbZkk{>C2RJ==pv>IFB$`kT3M5H7ZPON1sP#Qs<%;O`m3QBTo+m4tuuowUA zx^u5%UFdGLkVYg|KuN21EVUB{22hcT8AqFN5Y>h}Rmc1qJvKtg(h!#qKW{>$+0l{O z=rDxw&N(_KjR5G%_jk4Gcx{g-+Sy%O;LOW5O^@~=vaZ9b38%O3E+>xb!S1CrcCx<# zk(X-Mzp8XcFvM{6T(g3WLf%@ZtoV-51Yj*~ssegXxK6#T<61CX$GA2HYCArPN57Vs zAEx}EJa7Huf6=RV1xz=)+zcSu;Tc()bsgE2TPlB#>Jht<%3()p8wvM-5yqdi59vNQ zO@*zCjjCGx0bFOcPyCq{XzxaBH()Iq7g~qD2`>lN|J1zZX4etQPP%iM6K=tzidSB} zY2d1jLzD!RPPKL|{h+^pZa3vbhrJz>G6?Kp;Atc29mr(J)}i?v0ZwPYrb?^~m5bJ0 zJ>lrwvR&QXKexCJRNuvWuzAJgt`_~R2C{pRk;_WUAM^(}`4-t^YJcs?xF1!Ll&|u6 zcp%V6r#a0&2ol5kJ!YQ|xmI1foBH{1e%M+OCmwMf`TXm79~HMA>=~XXElSd(m}JjR zjj4slWs&O8llIG1fR1w1cmgTU3y;;21a5k=PitGLt+M=7?l7Ee6n294(BCmN#$+mu zXE4gS&Xx-w-qs4AMQO(#TKG#E0~EDHeeRko>st*}&!eljwO{B3*90RRU<-=+BAD7L zUF7YbXH0qt+clr=^f&$HA7~T!2!5cgLB0~amr$F(^C37Dv9au*HNw2sQ-cf)O*4NT zl{^m&&@Uh&WDaT0a26YBPH!URvc&)c-vYY|UwEAG4#R0pBUbXr?wuq;3=XUClC~cD zZcok_+-d`%N8dwAnQN{+&}&OxAbIF8rGRCL9sX&5xK;$#u? zkl@j9m(Q}j{%78^2F_!!sbbxuD(=`k##-SEYpgsDR*@=dXkd`e5vcvPN&N|+5(r>b zwRj?w66%fZ!15oeSH!xxsvF6<>6^2pr+8t+DkyFy$}*iE1q8=K=Y!b3uw1^mokJB3>&)7v>IM z=~`f>hQP|ULWn&o=f#jA<*GGn=pe3>;cibKO(YLXo^QO{FN2exw((&FC@o`HS!GuB zH`9PLm^E;u; zMH@OdB4t7zSFNX)KwWQov8HN6aB{rqYQyGcP>UYkQ7!t~fURS-(!2#o7AUdV+}cJ7Js9bi_EBB;h9r*VCtT^w)U-TMx3*bQ2~S)z-yykkOk_kvgcI+ybZSb$Z6x z_Pw~ZziD+dOWp?OU6y2;$@+E-5g9vv^Y(3w6<|JJ)mG}`nd$kRxozu|$)R-{E$-et=P?^df-~pFOu(qVq zhz)-a;#4k~lq&NOq^5=|xbefTm4e;0Z7zBQOeuW5+R@yat@ID#OJQljKpq8>JuO>b zdMsGj4jaeZ`~)ohma!*Yr@R$)TopalKYaMn`szLn%Yb&(zUI#aYB>WH`SV#Yb)%a) zhb}e>Wa>hY@YZ;q&r?jG&8YSQn3oC&TTb6E_D?u!rD=XIVbU&E?cc?_(`Wf*6d|qa ztQJ8+wNxz~f)r`2g0H;h+UsdYi<|vl$3z7ZC0pz0j+%cXNp!xzZ$e!kvZGBkyoDgf zH8zJ>hM{2O27PYI5oK<82hnx1g~zQAyXRe0MkLc$+msTb={;=v?uJd&3n2=};uwnF!PO+!f^1r}DR1W&+;`J-)m;H^0|Mu-WYk~MHOlp4Z4@VV! 
zjm!(1$L;SxwW8B9W#p$i(4u0Ce-k*z#95;J7K|u4^v7E9T~F1LQ9G~pdzAFfzV2-5 z#Q(NfT{F{y{8K3qop*g5izJ{+&8VxCNBJeWT0TPFNB2)oEIe1Hl4E+phpkKn$M(ed z_I47@afrnssOj$2=8i|I7jqZ2a6jeU&VM@Ao2aiqD%_5RUsw)towqe7FGb(bC$Y*H$LIljyM>XRHF2O^ zAV{^oM#2r3^@M`aCnG>p>(84pUeZD=UF%w|SY;;GS76vJ!861A}r>Po5mS>gr9j3_ug6 zQ`PFt0VuVyf}ZD4i(28l5t#xwT(xwLn-EExs6BNvEOU-@8g22EC>R*23iDMq?^}av zvxeku%iX%kiWcgSfdi8l*mhngJKTYYgvNGlcfuYC-PzxB1?|w#dlw>^pgMcW-NCgc z){GDLz-j%LQ-T->ng#M+EU6g}J#rtQF3nNf;BbFZkg<`4=QN!MV1?l4QEsdcLaEx( zni^#t1eaPluy^v%&wVK$?wKoVy-sl-!7L7RY?Lf1GJg=6%u$hS;xa;O(2rtM?Dgeq zPy@wdDEXxa`27h$%l+{U%NRCXCp+WvXIVtWK9%2{O-sq>9^6&Opg#j7pI*oDj+T*I zZ&-E%bB61PUrMzf>G=?~E5n%aYt{@zWNj~l#`Geb^agF+sxx`>h3M zr*Pzc2Ke1IQhwUs-=rqc1-%2OH~(|n2|jnJmA?85AxkJ5P;xWS90R`W@s(}m{wp{i z87_dMcJbuwjjxl4VXS)S?;si0%3BzK-}h+@(pf1Qa0(#GAQ$IaeEb#{xtaZCo%%bJ zGR<3swRQret}4!XSwW9@f03D)&PMYiG3sHi`~i+~ExlUwtC6@G-O;Gj+J>scV|sYy zr0#hvNDRZ<_SaJBaZrlM4`gC;54El~MjnqQ9cvG@4VsDpd*i|{_1qI(Cx@GC@F;!b zBup0s?(g)2|7uJ5+xM5*^kjD^;(@WM*HZxbt0~LdKevq1r?<^|SdAd3Vbc6rGK?xa z+cfKRY$R7LbgaG@XY`aA)$IQAwq1fE^U@4F8KdNxIHg&sRk!GsXTd4KO2KGK=~$YC zu3`b3-4m7N98@x4FrMBy=df1EpNma58El$i8~2y>r8NwFBmmY#K`n>(; zvHG!f^ob0#jWq+X9Iv}-!ypwTV1KhDZ|v_JwCF?q=&{k-STY1_T_d&9xY>0ip$eN3 z>)M|2=BLY9WeX+)q4rU(h1Vg{neE6_OV`8c0qa(*r+f524^d_*A3c^Gt$j4NS8FZ{ z#5gDms(tZnfr;EeJ{v0k+k1Frg*%(d{eCUGZgq%VnDnZ2a;E3{!p)%7d0F9&1hY^w zJ>I^!yWy0|qCks~5u!d?QL^?SBAY=+Mc3Tr0LJik!yi&UT&8>Y5ID_UGr0+s@rTbi zqkb*o-;7I>&`<4(xA5m6K8grFxp8QYPI@bxcdfEjyOt4d!&EJmG>T`GW z_wK9!M&5EKX0e8vug6^oH8L*-U-h@U`zQ9!9oSz&zXu~lLafjJy+O-YEFWA={eZlA zd)3wbu50jMS}D8X17NZphl>Z{^b7?ao+`O{-9uPS^YA}Zbe4znNSPd3tvmupaDyTz zFyzrXQ0e96^+WlpFZZKKWNX{l{1_m+>o%-feJy<>s60j8_$1Wjp*%6DdUY1!Q@N*3 z=JfG-?N<$U`Ll=ix%CI`=g$~$n?HFOTw$z4V87^~^r^Ldsny!zz}Q1_9UpLH@w zRs4&%>Z=vVhQXNDfso=KoV|?F;W{;YO}DQ13MR!Z!*Ca;=@2rKy}WHPd=1Wf_zW|) z)O%jXCEt6>!2Cx4i1i>pzX=n&&6mJi{k@l$YtUg(%N;crc*ms-F*__;*xv2$y|TuK z_rPRjd-GL(AC~Ugrknn@|8Y6{4YdO98u+uU%YWAPiy0k1#3!#Ct~Mro)IT*mMm)AvpPM4LJ`+u*mj zWV+2K(^02>BM7R20c4nJ)J?@_pJrtOGd9V$KM 
zXtcuETvN+M$3U_*GGE&CvH8h~Z)H9`4vI`hhDU2b`FKQX(()IvoB;nY8l}2wIsC-j z$3z)7^OIojsMA;|6MZ6=x8eMh9$V5gl$;7z_88&F`ZOqcSjk7xxO@6PxxL1sGvLU+ z3||$ej23}QkMM=rQ$c*)cqUT1=BrhsXF=UJxWY3zp`e2BEXAe@47qLn`5a7=XRxj% zDt2A;dseJr&d8k$|Fb+@&6$J(=W&2Gyqv%_@9i56FoC%y{BZp z9pj}#!)1+}0y-bO{?zXJy3r#1lG@}pAmE#YRQJzd^TJ)j2aKMxa3r~@7Dsl2$>8jm zQZQolgt>BQA^Q-Kb!{N6HFZElPQQI@)1KuY>*T5TNDUPL4_Ch0z$ z_1S+3m)EVYVe(}#ubUjL@%0s01vlf=A=h1#E>*d<+bGRzSfkm$Jex-0l4$5{rB4Km z2{cQY03ul?%;vB67Dnc(Rt(UF`ojnr@l|`i9ikq%@VJ?ex4fGKN<%F}lCJX}Cb_AN zAk*8j!TNn{^1pU?N|pW=j(Eq%+OpgSJ)wiu^z1`KvQ`lp%7y+TOtlz+Wk}%T{`cDB zRn7MkOe<#X{rMC~HV!}Je~wLNHv9HT`wIl=uCJEzrE729 z#=ji;s=syJ+;GJ^{~DoSWyfw)f#1Ph!&>p?=kU1b41+iFfSK+Lpews@!Q3uchu1$M}pPwmz>Lpk?XV1 z{xR!?>xkbxN$Wi(kF~i9^Z2p-&&+djcpNN;wJN6f91kTsI~co53MXLF{fuO#+kFQLTza|BOYq;75P5;)aRUIWdPcPT0ZPFaZ=pK5A;8B(%MEj24$ z5lCrRRjqRss6>Cz{n0)UNtWS)Uxw_i%_E$SY!IV%fDwwh7(4*ytyfL$-)W0Ze!zBv*SQYwX#&^ z`Ca>!Y9jaVic|LX{uXC9Oo?+|iA*7*@vO5Hl8{g=b#@Pd-HITv>5|HAu2ZeP@n!1}NrvqLON`poR>R93J^BNm zpF5#sbjAGWD1G=YM9N+hgZ9e1`+HWi%^Mx=L8QJ{v?yrz_Lyx^Vef-ew!vz|_k&52 z;o7&$sPF(PS=vd3docGe{L(fPUFjh(J#>}myJh!(7!#GTmNX75Jm>SkgGGrS$)n2V z96-7U7%}qE)VoW`eH0fla%ovy=YI^Da`TC(SR7B}$^O$*t|m|7QqF7FT?;>zd%SFQ z=Wsw5!4dyRomKygYYlmu7C{LC@`eg3LMqSoPy34QXN=KKp2u~PZ0-jyz$91R6xFp~ z1S6))H&Q!-xH;6I{xU?WE$W!rXKmYm1(nohUi}+oIs|uZtc3m=7`a|Gx?{Ma{J-Ac zuzpUGZ-8B67;Eo%vwxzFGAWzbTbR`4#>L}_4);&-ek%q19YiXjbZy@lZ+aJ}5?TbI zKP!KTTKRpif6_tBHO#+{NC7rkKA?%fc^hBZ>?{5t38@2-y6ntpU)m4*gr?^fe$9aL z5h6vczF))O$Nh7Pve>deK}JBjIlTl_e@16{`3#6KINX)#@pDA?;i!|^r1=FV1+C4N zWwxM_;HY++uBMOl4_w1;P&1LQF)2^K4gGhyNsZSUXtlcEV`NJI7PkN5H0Cj zRNh-BA<8ZRe}|LZHU<8x3S9hse^Xh+%0Yuh@_#IrcW;?gjszoa#nDlZ%FpwT)~+$> zXfP76x4di}#~`%zrDt1QI<|jyO`Fgj*Wb(;xw_r){SC_}XT9u%{)W}#ZBlq*Pl&Qp zD?2Ck#Nh6QgBt9~2$Gnbt8fsM642`xemO%Oaox3yJ<}xb?$c1|=glbHDI-YLv1$WN z#AQlbr`$1lz-oKpmt|!=6UfN9vu%VdMV83}>y49h^HdPmQo8J?JT_dW*@$aQ?Fd{~ ztK#Z!8K`!$A43l)U(WOTCmhL6;{cQU9h0+Gpw!<5sANFkH5Ry9xMl0W%D`Odg%`O_ 
ztuJpYSM~eFxRklf-Q5n8OOOgSymh-bUkVsP79RJprfuY>$?Ka|v236=L=n9Ao*6rGEB%@`3No%=gjM#uk&RCRpE^bCa<^>i2^oLDw zRLEAO-I8h=!4O`thifCjX4gvd`rnjA@Y+5BhIKt*3!K+-?<#>@hp5g|@xpb2>%sKd zmA<<*INgvNj30Xz9;IS{-gIdx<)f~Ns{Dhrh;b-MvqF>scLI@0vxQu=Rr%kZC&sol zcfG0p2J4(tV~75dMFZ8f$qMnw+^!HaJvrHeEwrI_7Ma?$Ut)KBMt0+5ZfNal`h{zm zJ7D{nB=!Yar>a69fI2fyO_+rWZMx2l=sp(d)AWThPnC~<(+Y0#YhIXW$u)tz85ePt zCqq@^EtoX*9c87v6;v6NDZkwY5J)rM)*`6E>UI>Nt(>q;yB(i+2O=ptynMI}&39rH ziYFFjqq{Dbvb$ZUZOm=2Z9n(Goo%8L|9h`_`t94$@>52$ z&PV?lY-BdDZo|K(bHFM1nsWW1b0ECL39_X9JdnnCIk(-ZVVec5rsR$~+K#@_KYz8w zGsSobp|Ykch|$ZgldVCE1`ChX)35XoZ~&-sA3`Lb_Llkf$Fb)%tT+1%J>QINEsWvo zNG*Eb#O|F0`39t+*^nVVE#n6mN$)JHw!ZdOZXiHMQ8ImiUH8nkr*{yNTQS|vEO@v7 zd8M;Cr2ifw&1Tg2+CH@Jhw9Fq^$&#uB^NVB(i-wU0EusB+n@U(5Mi&HEVcHd{5UyR z9^)VPKh_+Ay#tQ079OAB#8CbIQ=G(`wdMs1MxJ-VDY<{gez`GS+scm}ATO_egEzXFvNR3dRSEQm#YS+Q4=sD$mSuKSS0yeiSk} zWOmyrk)siL8x?ZgW=4biF}SoPmR+Chv9_XeY=5T>Y1FpTtBwoj(dInS-g`W*TF|`@ zethrE{-c3lGL{Y04E{t=9`x_qX-KrAlMoT=j<$q#GMpwOR)S$wHcvsNS#GfLqgGo^ z?H^j%oWV}(AF9m`ZNcvJo)UW{Q>rsi)y_mpE+3{PAl>4pH&R4gYI5aeNdM@cq%p;$ zu@ohf4o0G8$33wdr&a^$F0tkRE`A*HR)B+kZg`fn=#Rz6_Ql>LtO}@OT^01TW3q97Ad@cNyf( z+<}dil#tvksqBGX1xLa&Vjfq%*I=|Sq2A~quG4OI&F`wSR<6j8|9t5ZI*99JbVg*X z=I5~in=0@N1(WzHzGF3G-h}CTM)QQN)y~7dK0|mla_wdcNG_*LRIha@E0a5X zA-1?qYZ;v0MTFPoww>I4uXjm2jsvTJ2RA@@>3~3&H4_}^8IdoVE;^bg%C}F6$DwMO zf6vs_mMSOu2SnIurr_X?O_fxPYViRO8BOaF}Vr#{iYDG%aXv1Glg6w}*W6R5sksof4mpsPzsk;OYO89nw2 zy_H~ncjB~$esuqr)D4s*$7`R)-LBICmY!oF=AQiYM?aU*URNz~lQ6dEr#8 zg?i!rm^8J^YjR&7qz96MPD8zL-L)zK-Y5W_!SW$|UV7!M5BVBcp2gKgB3I5m0VksphdV&(N=QLY7>mt@~m;f=e$(l*ZX_QySPOp ze*=|fu&NJ*@Fq^FMC406(6`{!W%)i;9EQWWqfY#!^fF(-aydG^rX6j|c7aH;qir-T zz4$#GNq3%l@Ar39e|@`o0RFb;xR;CUUEzs7z$x#6G8lg7I;(LXS1!H#wTN2EU6ZJp^GjP&O${;*c` zKbP}a85X|7q}R9oNOV5DqEY1D&Fcnf6RsmL^I}8Z0XSK`T9?ulZ|sS2pCzKYDNjt#&ImPN{Jt4Z zFWD%2rJ>x?KT@F`wB}n8-S_yJvVOGFPK>s<;Zq4477kJ@K*g&4?`0tF_Dek9yTlKlrNw(rD0*BXcPStoX(q(`6WywY$1=~ ztZY;-_ji?Vd)3P;nB+}f%l-5aA}?~_*jS5l_gbG4Np%4|U&p0lsMr))t(QpOz-ByK 
z_`BJ;e=4Cj{gTf9Lq)QHuwD3pEZ+X}xBsdf9S?)a&-Bg~Jb>QOv*zR? z<0e??d)Nrx_F95}AC3sh{eP?-T=ZL9WWTEV*ax5nv1<9BP&-_w9%`AfyyZUX6DX&v z98do-PCIGmchWohJ2u%IVKnsGPy2N2e{4bMKI@r^UR5gm=a_o$RYsXFT&qaSBpSn) z0c`Asv9AE{GE}X*nz8|ZD{Cyr+_>a?Nh5QDJFvaGoar|3E zSHc&xG~N6=OvEu*I}N_?pWrZ8-`OKRTztnhHPJ5(=}44p_|C+C`zSb>+$+mRLy20rxu_gCFR6&TRrX`SPTkgOd#kD6H!Ds zKc`Qg)ZcHYTwp;uxhI6fDI4l3h!lOV#eiz8r}j))b!t*^8m4;Gf$CAG1B$iqi#;`M zIRlF9g+4ZOT>>l8h_q@VdnTA7@eehdY^Im9kj9U=>_Kuisl(0Ku&TLNoSEOqx@>RG1G3%R;I9Rf!;N+!cO88T za#76@jx9bsUUnTmGOOc1cWArUW76i!LdPk>T{qxl$s5+xH({iQSJd*+D5&%ck7#J) zu2uF3^-n{afReg%@Y>ef?U)o^RGgeIJ%x&VSlT4FtAA+qwd)3l2I(A_WT~yai9JR4 zz-0{f&1-%uou0Vc4{0Vg`_%+;4l52tLH^pVjRQEjwxOv?dt?8=n(E9q z7^8(#UnrDv?o*-%Ff!K6fk&R(H@z$vtC%6b9G+}=~_Xu4Uh??9#A z>I9H(4e3s#ma(;YT;2sF-SKu9$K7!9;*-!^67T8nII!>wAHaM2J1Ur7^`iSQWU01) za2LEESIuvCjoA+XI{F_jW8MgF>?FpAfUe;)gs}M6qTR#D$ad{cD%vC9e{<}K+Q}w9 zx@|aYlWkHu=qHV2WZ0PTsB7<|>*}MA0Scur2*R!fJprdsjPq@S!jl-aGhYYAP)ML| zbg00z<_B#<=F@2DA2{?kRFXWvv$$y_P!i1kAvO+-_b>dSR-2#C&GJ0Bh8_Z?l(9uO znZ4MjPzK_fV!qTfgKeKVZ3N@}fBeUPOb>CLCP;F%sr*n+)LD^U`Wm7NJt7oQ)ynJr z4c0eHCwK#qCchYM*IEfqwkP+M7~Trl*7gP-22#Wsn`cT*done1J(cH|xWPpemp}TVrIqG|<Grs0U(>J8^VH<_`tE-L zq_8G}%|`GgM*nJ>Q>`IBL)kzj$4ky#Lfhz{$<&^{fsrXIx)cr9>PvZi(rXd#8$>rm zJIS<}biU0!nu+LQwYBg&gpyRqeyRqLMrl^Yt)zMf86WX~E!Kj=l=Y7zVbAa!)X|Ot z@=W<&_J<`Mjn>+`Wrt(>6n3_?*kgl*RFf}nI1WxZO*g3;U|OE-H?=->0wOO-kOYHe z>CPvjmHuaEUr*g|t*|ZVAKkj#LQh7?r?3+8c?uvyHTbWfXh4zvn%Oa<8AStAk}R9j z>3ML;#XlwVGXR+>o3Z5}0gIR9S07C|J2PM+3m`yR!ouT1b+?YbwC6fwn`zoPxG2!I z?q3T!wbAxmtQ?lnPG4FEr3DSo)&h1jE1uUs=%d(Nc&eA3k4<3?|A$)rrF&qyuIuPR zZR0|Ojx;neMp09R7r_zu<+Y%4F<7;gGkh+|%|vBkoH#CpiDv4MOkU>NyBv2fm+eHx z6_^xSNIY}!n(ME^CG!K*9I>p^sDU*-IX7$s%DdL~WDZVf`$ezl$<_PD9JAC6odZZC z`cGKKH@H->+uO%sBapJ1K-JK{36Tt4#n~vwb`0S%kPOUkRp8BVL}tO(cb6HXXUvY9 zH_H}`VsAYl&FfrueY_OY79r<)+>&GeTVk?{twQa!r2z9q3Xcg;TSvdaJ#xB^o=D*U zx>~VOh?qtZ$<@qksrut^a)wKe%=Mdiq3=^O=94b3OeynhH0>6j9?#k+?w$!$A)s)nmaP>b4Y1%WLa zTrHp<>~H0oo>jz0QSve2tNWPClwjkyDtZD;0SDW1?UQicJm%Q0TC#j956pJ<+o$^* 
z+UVe`@(d@B}$fHn6s;b)EZ%wcS*q<7|i2c@h_f5eda zu4{tUxSKcYdteG-5>SWPX>af2I^l~jLdSq*ves4n`T>;srFB)nGk)l$iGSEXH9T5p zHGULazBFxo{TP;swdF(liR;w%?)_@}Q*f~$W#Qb)wUKs#jWkB4#muEyAXG~Q_@-F^ zo%Cxn?9Jst^d&CRukTnjjW(u!h0fd7*EzLcgK0~vM@z@2UtqkcCQ#X6^m~Li(e@SO zoBsa8k7RWDHuvi^4%jGb;dedJJaFk4{WE*IkQ+yQ)P(4nYd2CZpon-&v#=foCyN6m zA9-_N# zlq2#p`o}9E9_0c~6_~f~t<~f+F{$j)$ph&n@xfKzY4gnBt*~@gFrpsK2@}`;k z07&Q}FnP6Z#|TJ|NCG0o0+#^Zqh1I9gc6ZwC&lQjkMw0AF~0iO^Lu>B7%kwWrExX| zXcZ<^>p%{Jj}+JZK+SjC8rGsBo!w5|7-6_Tbgln^MMZc3C-DiBMF!5Q*$tRhGd|rS zR%`^5)j1O11a)JJh!&;L8^kKMMO$7U@@v;9n4R*~*^FCkbKA;~l5w$D~+d%&bUGo84e`!Lde^{)u_fNPcB+(@cb-q_#meDmpYS-A<* z)gD)bd^v8$r6CZk!*L{Z3yN6A_B9jgtwChAft~U;C=zU5@3({A=|;w2xxWMUW&ML6 z(H?SV&FyQlc^BAe<#e~w)@wR=H#Tp#FurYimG9I&dCvQ;E=~PjFl9dcuyAsJN{hf` zjHu;Yszm?5nsVlN03>2_{%!~JgMA7^T`-S_FsbP)#>P2VH-G$xkpxfeHSRnDkn4EG z>phqsL#Qp9xm{E8Cv1XM222Gle|YCF5!@!%!)SFGpB#yge8x5NWgJ;5$26wz79|Ufs-= zGXEZ!Mr7oqTr2haxkK7!t2Bu|5m87@teZb4E^o4GS(`jQ?1|MRqY-?Bkm2|Y=ocTu zgv79FYX1okxvrj@8D97mZ345XO_T?|8Lhd{XGvtaC{CIrWdhc+EH=<3TgWqv z1eBVbo%gn{dU&8_&0qJhos0fEFs98F4g;JlU&nDPBJ31?h`hYfhqbNkp{2~Xsn~{Sr__1&`L1Pe;MSztBZTfg1_3P-;ieN^s zKy_pF)HaIX?g<0IiAZ@Jt^EuqxsEVb)b8Jt!DQ6P^R(UhCQGOEx19AyhUQbjNW=}s z-qU_~E<_O4&(7uKc9H88eW+~~xELlpPOSVksTzTYUo3kHT_aH6YQ7kkf!zRokHNrJ zt`(Rhx4wl^UIiyRmz%$}!%Ehml51OW?O(4tUyG}zHh2|1!!?=E(hL*>K+;M4D2cQwVQJ7pf+X+*O9@- znh4P*5XoK5+S*det*DHYgqtAnp1duoT)o~0xgHAO^sppq9_Hxb?0Co;Bg zVsfN22;PNNAK6cRcmGIj_@P?B-6-&)Pqrie=okGxD-&)DmVdL6Nv`Mee?Nkd_Onlx z2ipT65v}}mSR^cg>^f9b5t(~qAE>8q3JW zVFiELbtGZ!tBxIe1))H*`^%1e2w$fv!K=Ti`oocWdJOws6>8v5#6d>SV2qb7`tb4t_b zDGSateNFBRluD;q(w3G4jPUlSX#kOU^ElyCRSR-wVe@A5zh(+vib**KY#pL(!0LFA zy_ap}T&Ppk{>EG|QgH!k z9J$ptgyxX}NpWtJ(AKz49{x3pj6AIE2~qlLVtNHa)6f>S)XvTUFp6-uR9L(A{w<{g z-_;|PrH0Rqd9W19(t0+5$rEdh5pd|ABt-MQrNwu9mhyp5j(lS)-0Zc8)ZW`aH)Q?2 z^9IO#3pQ=y>RC>Ee`Mw5YZAPB0+mb&oL^h6n~7>WR^Cl?Yull|K7~+6gtz2kiU`#82;Y9qw*2d6 zkdg1kGQrLEaP@?m+YP2n#NF0a_h3?{%j@X$W`)^@Q-G^yYVWX*+JXM5IcCB+Z@Cc@ zahZ+9#B~!&-gL719N*kO!mo#OJH@1bq&7%45A9n~^18eo@$t`&xD8BI4Ke=yf?9&m 
zKJ|9fk~@H8bik&J1Y)=om*T8w&J*k&7>)7Of3Hd0?K<@^Ir{2f(MMo8{MpZb@>3I# zd;4ccXL+-!-uo~ql>J^J{;0b9lLP})%moj?a)ej`*gTDiG)o(H`!fjAo~pyEEW0?b)8i+=-3qi#RG33lI{9DP!$yoFTWW<+V7V|81D_urIeGiiK`stAtZ03DXsfG;!lnfxHVp(Kt`~Xfl7?LO3s{MzkWW}tLWw*7O zkCMpV>59nnF_3zwr9|V(CrM@Nbi!4Ym_J43TfoHMf=TIlZ3y_xFLB>7eC1TT_KPxo z?kD9i1*uuu7m&P)Get4cXTOBgTzp*qzCKW2VU*OQs0s0Fu=|SD_Pc*9!O=so$?eAS zucM0eM9VPi27*}kl`{FQYgy1F&ODb<1L|a9G2Jm>ZHsDdU)5BO|EE2=zVdryu9yDE zgGaeueCb$yj*mt}@M0xuYsbJ5eA|Ok&G*=z+9pz3Tb?-%rGRaI<*Ri(m{cn(qDg6%}Ke}A(~q!+TPBS5!wOgM=j=*fDL1Nz4%lhavQXUH97a{E#-oG z8aCf-af{TzuCj8|E6~ZqnlW)!XcY)~_@lGv6fR}=uz1>K067y(mNtx)fX?di-pPvG zwiHe;sRNP5b}u|kqrfG3!*KQSa}lZatu2ZpWdh?Z_O-OR_j&yj=G65~JHIE&A6xk@ zKqMc-Z3Xs1xLZIt*i0h7sSn57<@9I;{vy9hUJDSlo<;$tqQ)0q*gaA)axdwrYp9IP z;(aNKaP>bKxD3#S);L2*cU%GG?OU5`?qWX8nA2C&i$*JTbmmwq(VGWFZvrQ z7)E(V3}DEpeMI?aQZ4#h4nM3Qe@eB;Eef%7hgCMp1)_$Ct=r4_YN&s}_P;i9+nnDA ztu1-OwNMJp9fkiBA#OpXvUZH^H?J(!^g5i5;{-jLgzHXh(`0Z1m{ML=h-@Q zdNfaNx@sfSg=?~7g!97$pf&xg%NQ*3h-p}>mnkqAD7D9c?=8Dt}>$oG%VrgH3YU4n65fYF(NEq5E$IB9wt+dcgvWN3R@@m>B!q%=;X~JNZBdw4$)f3bERh8gGkEj8Fg#-;Jp~- z-tLGxg1j$?Q%k+|{ye_8EXHPm$OvH{g*k_P+XmzZv4nNbnN$Ur$)^*|X$xT5PbmcU zrBfJC#3kH`h!c- zsPRl*U+-_+HeDe#-+)uTt9N7{|C{|C)&YIi-wIw=p7rH7bGZNAvGUY_cY3^au3G84 zJznq3DJlf49QEn0^V8n%e?I&`nY?}rr{oih@#{W7`BJuJx$@Lx>Seb;rrFpV;RvYA za6iIo2?JLxUqzPyQV*BUlR^s_@Ch!G9u@JA06y(+qP2{P9jHj3;UWZq04(3q9x(Y> z&j~(ks$BMvFOc0hB(f7FqJNB5GPdv^DG`|Dpfl8}$FFhZzi@DC4OYK{WSGaqKH397 zH2W%?f#1H#Z^Gw$*|+&keJ}il-@fbN&NB9Y-^1n`al|K!1u*g%*(ng1nD$GVnf?_RNjeFBo@O{+HyQ5%3%*UaRrx9q6qe*!Af zGrYHWk|*|zEn8}f27&SVJ!N24nv?q*#`dwEXf>x`ymGJFlT`vJ8~RIuNMoqUQy>d9%e#ssp^(u(wULty~wNq<)3QO9&SR zIfGY5g3QE1*;@iCuwA255G*m^;uvSGMU% zdgF%v780uO+D1g`(J4zt=1p)qaa+giwD=)x1UopiiA@6RV!u4}_N_d=N}1~Ney!hB z7Db5wxAf1go73d41Czno`KkJ*T;KoRR+4W3Nx#lOCZQ21rJk}$vK%l*G5H{tw%DSa z*IUNXURlNvDg}@ZzVJ9#-Ik5*{d3muwKy1(UEQgA~XR&+HtB)_qiRd zai0CNW%Lf$DO7@r@HZ!nJF#-Y2x@x}wFAtnYwumHn%|8{sWW!l0j+eFFmf&OmhufM1d&x4oniHQ zVA}7>rcpiuOSbJFFfF8fAX2u@FHv|m3P^4<6tpP4bPz5>6D<{TU&wdV**P5Vr 
zL+A7TUA6b8#=IALhVS;&eqH^=o~d&Z%H;kMMqa$O`p(P!?{@3Ph7o4XzkkVzfzV> z>%YjubNgxkl!Y(B6u(*ODI17n#Oe)KPpSq+Mu>-^;X3bH-zJj34_3D9nD_=Ji1`*P z>f1c{M@w|q?{d9$!i4gB*Lj8MTp?F3{e)D}4D zL|6^*&Z#YyCk5I0SbLC84w{b#=qXTYJP~!Oc2C81QWnNaVNOG&;g%QsY_0s9j*3)< z=l3<2o-;6Mw?F;5ht*9<}4wzh& zcBB~RBGl|PW%^j=It4c5%FcOkvSXUhS(563$s2tAYopRF3>Ua0IYO9{ybzHAMdSR{ z+jp>D7>6&)Z8a>b@c)4pf=KGM+qOEzB^Wh1B1|7GB%mUEQbT~q>T(e>)jUN6Wd)9xq9~o1+g}f`0kfFh%&hBB+c#I?3!~i~J zT|dvRwzFE)hMw~ot?)TAvJscJteviS`kP>-$g=tnusDoYFIV`VQ$(=paj^CaDEGBs z-c*4w^A+2YR1QB^edD@-id0Ye2-wjamu>*0v~r^Mo6tSL)P=(c$7OgFLC89aJ%i0F zVH~5vCuR&26Ry*`uA%yG&o8h3g21O-Cn1VjGh!f8%DEQ!VFvbMCcrix&4LQerCYn% zo!>}wRr!DI0lQXR?qNi<4)f>puhR@S~R~AOid3q)0slugo=2q(Ab3b|5bJF z!Bt(^ofc~1q9{VSD2Jjb7sW6PAq>HU5QY#!XqqN8g9)K&nlKEe38pc@G=phEQ-ryC zfQ2lBOba1E$O6kk##jg;;OYq_1dY{QI%i1OMOMKD!R!1PB?DZhoTI!$z2h3ryMvIzZUZks;Rk|TUk6XbO1@E z^LjMTq`47Pf<`re#rKq1MX*6_(yM01O_}~T|Lu)b?T9vbn}r@OXG8V)JLV&&RG`Q! z-`~i0SlnAXl}L&#zD|&SO~6gEPHL)SzbstYpht#K~zYt!hcjsP@b_11A zS}=!;75xT_-Q}nEbe*{yRf|;jj!62uU2cVb6Ir#Jlm3mYTWo7Iv3hVLz|wH zs~!aEyXeP1S+bB7L;t_|s=516+5LWj<)XXm^#B3yZAxEU_SjJKSO?=!77Hlyw|)>4 z7e|q3%TE~=F;La}F^@U0T3}n%OMhRI1p|Wa4&xjyXs=~%ckN1{yXabjPapR0#23jo zbAZPPss)RNbf7#zQLp~9`&~PxUsexj8d0pi-rmA6vjy_A{@vgGB_B1Rjn7dudVjEJ z5!Wr)q{;83zn$=s4g7UA|Gll8!hXk=`Iof=LRKujD|g@13*KxtRkif{Omx6N6je<_ z%*DhjXzI-LH5Pu>GX0j&p#GiTeTN6EXz5@SD#njbq`w9_q`xd?d-u_j{VXWZ)B)}{ z!nljca-!diOLK4Xi(@o#Sii=qV(UBUYaFljYndxVEHBV#Ah(cy@yoOuMI>)ajN zk!*4}=Gtb~lXdRF)7O#IW2Zf>X10sGdNQKEG>Cbzy zrjS5|-LHF-x#uIOM(?a)A-I4|4m9s=mnj#rp_;tM?~w3CGfG^9rWXH*1}tVvF~?va z1h~rPLU9Q)g}QH3xU9aEO={tuDJI0Y{j*H}rxxiaPL{LDK{?&q$Hfaa=o)@2nuSBZ z-FxoLc9rb-6b4bey8nOa>*QSmXnY0sv9gYGIfV{Go<<||a}u%&-n4;63i-{wVHx_uklD)47(_}x=h z5C|GyZ}QoI^eNg7B%HAQop)*YPBt>Joc^dQPcu2k?&>dy8y_yk2M}ndT@_(1p;GvA zBJIl8KXEfz0{`t{TcP5f5OC!c^joawS`W(!wsrk5eM`qZC20ArsNb@?!o6O-k8K=k zKgd(emZE6Vq&HYg^lz?lrwS!(>bTuUnI2hel%lBp+=F%Zd|6K*smFep4w@q>gI{Aw z&z|M|dn{h^qc^|5eyPqR;uBz(){~VU!gmN{4 zV&bQ7aw($!-|Lw5+5Hp(9l~e)DEu@Vs@?aw2+1`FZM%08wz=&(wylZ02JFr&>QP)y 
z%h4xcU1&ETsfKT|;HS+O5IDp;|G0xg1lgcE_}r~`n%J@e%UP(95nE6+8u`scxBRbX zO@YQi=CYFK>L-g!mKKuixapr(ZAY?r#*l&CAt8MuA^p9jYY6s|j4PQN|LKP_-Deci zuPk5Z{|w5N?y<>ESbc{lcq)YQZXi*0en7!3M!MMQuR1Rgaccxc-W$UJVS&+a!LtwP zuN~e(QNVL$lYLYHRvPF!7*p-9(w`db=`W05U87&`v5jB8zx78<+uIkXf0gAv8hOc0 zXgnC^9?nibb^8EWHDkS#KKu4isQEvx|D}7EHLU;hQy**jvX-0gqbUF13D8+L{$r@B z)w_IWW3BsPmIM+$=8cTE*%L4HN+3G9U+C@+r4V%R1%J;?efD{XLickYl74&4hKA7M zxqrtci~il2%iiWRm(ND@@3@t3?(v~^))q)qMBiIxU7>P#Wy0l*QEaJCHgDOs$-T2O z8ihX516X6&$jsriQVXiFi1GrrDCwAS{bg+AGaD{w#`i1V=N_4R7iz-qm5to-pgj{& zXb;{R!&&4cw$xzvGYr-E*hX z)6wK)_fQtWHUohknB%VK&x9$PUv0n#v)E93=J3%278JkVPTwhm_2>MiMcN$;v=dF;CyGmTj%qk+x zPl^78`Bd}+2y`3`@lZiF}#*f{C142PJYvzR5(^6fF|}npsiwLncEJ z(|fyUF_WK&mG(7=!??-yy2QCmZ=%0sRul29@h;IqW-`d_LVCR^aSnI(MN62j?B6+@ z5=;CmCMeOr=R8<6mJ?vnf8$L9(IRFa(VsKxK(74^v6w_odlM!$Gq*_m3nmrOR3;PA zbY>D6x>dvziR9y_qWMf2`nQ{xpG)M_<%MV(lZ5DRIV~0aC#Ho&dL;dn=>T#yYhX5z zIE#6}M6Toy61qpM2J8Q1jUl4{O6nIaBj?M|y?H)N;!jBOqM0Ok(E?Js=s%OmA$RO^ zWO1Q8RUKtmO71p;i<}ICjD0@gHCFT=|0rvm{hRx(a9{r`bH zY^EG_iv9n6&h899;GE73m^`L~?Dh9#ThS_Vt^K=-Oe=95IaZ>(m^w$|PLiu=IeAs| z?>IdZ&F0JuqV?`q>lT8fzHs2zq)!?A2lA#2zTQK&G}{ZDdkMS2WinmC6Z zglWbH$e4y7lQIpDkTb#be&u9M$zPX|Hq8Qcr(rRPQ|KNx2^fA#>J+-)yvPS>#x8QF zME8td0hm6-G18~W^huH7ha^y8H~uLWetn1(YAU;x2;J8`OHKWVENY1Vj+ntGq)|iK zSs`>UO;j2lC6k)LDN?DhyMk0Iyoy_EOr_^f8J-}UngOvr8DUp&D|C2BCWw z`huxAB525NO<>w59JxhuH~X~-zs4t*h3-Qu?PfsZIt=NeYi3(WVl~7Oox&?r_=aIE zxz&(d*DdVE!8Z*VHMh)wKD;AzU+C!(wlmi639r+P`@*Y)*8@|jx+m-=Ai}~c}Th3!mcDYIq3JQRM-(H@&h%smla36g0It4XzHOE5nN(_W<+FC@1U zQ!jf4u+a>`X_Zv zrgp=GU*nI0k%_xT{LtW=;~8KE}Aa6haQS}NukSxR|$^g!VX5?3Ns)Jtu&Q3t^(6OXTQ~wuM#I~gk228wZd+~a-Fc9B(mNN z2<#2Quc*XE;WywWFkLXgvswJ;62)zi$ra{;t->w>Vw+hs)4=VLuVTDI_!UjvX)62e zGL=3IfbJ-{aIWNT4xTUUpe1{RT||6=u$?|E6kcVwB4IZL?=u5(PO-2D7acHLs##(N z)Us6g4el%h(@kY2KO*@GzArZm#z}=C9bYN@iu8O!*iOo*0@Fo&KvI_6Mb51;3kp6Z zyh?^YE&PfAtQCGk-(-YpM8ew4%w+Mx9YoSN zlXK~~@sh7FBqj*2bCij~FPSbT3GY(!WJBU|itsCTn+m#P=fcw@caR^a3wwx@8NzQE z+A~eXxwFic@R|*}A_6#ij`1EmI@kChPM;_9FR90TllNAUoh9ETUKa{G*m05YHUng_ 
z@G9uKuOd0EG?P5i*D9HOO>C|l&ErGjBu#11jNM8%OD26{{!3Zf^>oZjq}?4Kp8rOZ4)26!lwZldF{ z$+=9iPsFiO^rtepLN7lvlRUcOx#YWKuouGHoaw$4b~9kU6Lw*c)u+tOc*z#FlQ9Px z(%x5u_lTK6rn1{$&?TZHjFBPYSUHXwYI;7s^QzbjGLsF1@#WXY$lSskRB3p|f>K79 z1-(5|awLZ&^13*7kWL#V4&@Qbqs9JFlKL2!!n>&ASjmw9{XI_dZ|I%z!X9$V1YtX6 zOcZvJ5+{M_#7aC&7DsYOGgC~@p`)f|bQ0P$m_zokY?v;*!x3kgMJ{<}rsON&ER%DH z)7g@H7+-TT))X>V94n=n^TdxD@!@=#cT%kd!VY?Bp)4YK_-&CmoI@sFEcVVYdX|{} zKHa$#rnXl(=rYNVS{NM5C0_?uNcI{y=1OsRFLAmmqtlGl;z%A{w8l7>SXe6#HB$V# zj21L{9R7e=b(1(=Lc2GMBe~Rgi#VA>`CG+akci&~(~@?&X1nkk z{Jz618sVMd*f9phE}2{*RRx513BX)ggaeHCd~v*l0{6(|E8?XnTOzz>*D#_shvr2Wwgzl>mdsQ^)lyQ)JavG*CU*eZqGtXgU)=9ogf~uDs z4dSl`aX3&!J~u8nNj?{c8VS87m=fhZ1j?`FUE=C6oWbf~DK!DjI#$xEk|YV!sZ@+>!a+k4fi}6US-M zJ#i!lKi-!~Cxt%{UM(h{3$GCCVKdIfEzu0q{FqF-So=zN#St(8E z4LBnk=1A?7K2WmX$QXP@@?APDM7<{Y z26<|@u!9PY5PpLvM#?zi;_3AaGeSm*LwR)QXqk5~ipNOys+fVsisL18;y7`5FOD8B z^KMF?kTGZIOf-`{WcW!k>0-~xW)dV?r-(y&#Nt$Otbk6RCiZL5r)TWRf-_*}M7nFH z@GhM`OBRtFN}MhE4y&IzlD!};nk$Y5$^7%g{sBBNU*>l)SzvNGV_~7>E)w7($zIT< zb(0ybOC*PL&Xd(MCTB_OGWn7MmzzmR4ryH+aSODS8Ky3)WZq77R|{{`%r$1=md0x( z-@&2lgjWg5^@c?M1~bkjiER{zb8yfmnRK5fp=V@Fwuob;#M4%>7sSomWZq8t*)BQx zc^Mg99BO`_j4qQd2Es1k*UU5lSwwQ-T(Muyf%3(%ax&*0nO`9i3M7Z~S)dh)Be{&5 zBI6ulai7@VP2G#dp(Z?fz&MW@l)!|-mkg~^$zCHzDiizfGyIN-W9P_AAMEW|4I~H5Pn6)gR%&h5DHD=NFFY2F};93YLj_8A#z!A+?`*wi$jg{QHMB^Lz}P3 z{2QX^dPXK)bc(%CF~1>`ZgNhS9%O9D7@pYslnfIV$Ii3*ie_v|$?D?pUYwH2>wp&1!+gtw+g%( zayrOjHIj!X2Z|pB@$f4$>BJj@%p@1{!D9bIN*N+fd`gNRD)y@CN$Y0s((tehWB;1i zZ)DOMZrsA5MuzAo9TB0Y_g+(ZA37G5Dv$H>C3rFml|cOs9I9LZ(J z@#64a>N3IfJp4OR>;;LjNoG@kvnLz7%Mw$}riwu}RUALfaGqv*1HCXEroKJYV20#y zX+D`%kJz_6N z5-AXe1Js~U9Ld24MP^^f;NNGwk3KDisX-@^e?W4)wU*>=CbcxGR2<1AddkGH0%~?d z914=6%f(&;JyIc#Hep{W4wn+OC(J&N;Zc=gBDflMRiM{u%)EdoI3@WG)Bb76{(efY z%`oGw&UhbVxLzD>Vm4?Hhf5hx7sT;mqA4g&yib=jnf)b#qQ$rwN41H)AboNfre=5P z{C3G9cdFE3CROOy#Niy42iIkOgIabuC*n{Oo$%D`gRC>2nSCxEdoB*=P|gdn ze~ujS()0j9|6RtONY8qh&g-!+=#Xr2xRl5p2vgx6hTtoby&$7>kl5dY_Xf-43bh|1 
z{0)^EDvNLlG5M-E+Dc*?CiaU9$nfG&D_uEU9Lb?kBgElS76v2D<`R{E-Sk4%5u?P9 zTB*%wvuP%+je(tq>A$g({iB$S6Gxi}%JJgZUR*yRqqFNoapE`~HA(CRNl%l-egXSV z5r=a~VN+oab(83uCTyp~>1Mr~Of@6JlsHo+J;<{pC*Q|qvn6+(B*jbis_2io;z%Bm zGSBoNGsS#yIH#E$pRu8H7K)R3WU58t*oUN*#b$qqa+Zkwi}+)y=>?yW=f#Ntow;0` zbax|G80WQ;=*>Q#fL>*M_zN<<@j;rhMjXk(X015nu0*U8$Ig=d*2DPhF5_r}@Cu8m zjlzGWGMi-VHB#Vav0uOl++uny@v&9ro%m^+nVcccw~JGS9Dj#6+KTy3u~!Z65=Y$C z^ME+yE_CF=)awdv%9k82U>xrer(7m35PQ{hW}!G5q#Z>tyM9Sh*(bbBL=?*+Qd7`CHTrRHe|sg;#dXarCjWPNN%byn;_$*Qk*zMV^5geV3&znB=lCia4K{dLoK=aJ@R-bseJ zA-S)bzUdP8I`eLEtez^~6vs=b)-7?O@GMzg9BM5l&6~|xhSWWAxD=1wH+?Tr`2eO# zU3kC~w$le;S$JpYo2WRHhe=E(J**BAW^$g~(JKxgB(J873(+5nBLTwTv2k8Ld0rfM zrvp#Lej&m4%($Aqd2Tl4gw_k#1tKepmu9k`H1(b29$b_4D4pd~m3TW_9IK$h1C5W; zs8@{ZaKRw4zn>r;3{(0y$U`K@tMKYjvDZMdc~zV`Nacr#{iAK1uT{K!8KTN=lG5r!jF;?u=G8V=eS21?RixaifVS+f+ zibp1zevxrHDPxZtCX2%bjH4;yZx2$=RB^1FfiVrn4c+wBbd&eOGbG=pW-}!}YN1PJ ziTw&pW{X2jF4c=8xy0ODvuVbg^D;U%^TpAiOZDQ^2h?VvxUY!XEE0z<5k`x};lOcH zy*POkeW}^F;l4~9KT2;chh0o#zCv;&7hWm$o9Lib;^aY`wptu3qgiXr-pw{^#i3?0 z$U1Rv9aUQ|jy5q=H^3a~9<|vhx$ihF*(8qS(b~=80MfjD`RDX~x- z4zQ#sGCoX%>=XM{N67XtwdtTC2ZT3KOJosh#`C3S{mJL#dgG%<$@SuJNdviF?A6jc z6=t)anpK*9@_ll>IMm8;t`dg}NCef!2g!yt;@D}r^AzlAMqr*claEt<^{2TnEVqIe|xZuTrZ9`kx^RA<_q*Tn8V(o$jc_T;)Qm}S2${i zij86h|A0xf|k$duFN2*xefIHk(!g=cd?eAQo@I)ZllR-;wMG zTFCas`^gpe#GyQT_P*G2v&jQ-@-TC#XZm@39~Os8abVQ+TKXX-j=M!rLYz3oBCFSI z8tB86IOSFq4`HhI8xH$eax9-vcp?s$Gh&{aeweO(CJr^z!slYI;Vk(crkLAA&P&OW zT;%V>(I6u%>v1~a`^D5OTb%j?FANk%FC8P{!xVFc9vmb&)XZQTY}`TvhlrEMsPRy7 zB$va!Dvr9Y9VU(i>Ce~1aW|g~7kfcQItom?~>rpOK)IKxb;2!)x(LHcl(Om5K+voj_{)EsfDqL_Ryj+I;@-0CrcT`4Pq~d%>{9+lBxy8egWa$ zWHyb&W{c^Cw5m;@s!%+bxb2;e(sv#9mx{%lH!e-VrBVI`1+441s@79NtS<+&7yO1mgp7sI`%lFOKaa zjKbnnKItMV4)39VV&X^+4Nr*uOVq6wrm;7uPD*m}3##)_@=ey!k0r;B(o;{w@fwEk zQ*p0*kl>j(=`M*s7kjN_ofj~rUu6uuG`WFw#&=@hoy=$buXM(bmy_$V#r{V$aG*Gn z<1)T+F6;C`;zS+J87xj#(?3JR;lqsnp=O`Mcz6}23Oy7*OmfVfd%R}m1-r@k#wD1K z5c>rjcBD97P0+n=yq7*4C646M|D(N2;FHAhHoP-g?721D6mjA#PM#|E3u($UaoA10)5U%J@ze})?>WZCOmX58 
z)t?1Zo!cal*^=W&kmnfhC8FkvLrp9{=ZTXCN&oZ3i4SPif{YDOw@~ag5HE{hitlE% zy4d7u(%}-ZzZ<75&FHLYmWgBTfraJbGExoI~fDpCC4hM@D4L+BS?3OQ~Qa7^sglGzjD z@LtA1ulU=;gk1`z?!Uuj4<#qd80C+}p#ZgiB90Z{vZvy30TK92+Ne(CPtqAbe275E7W*HylJSk}IP5FpWE;IO$n;`dJlMF3pdJEK zWIGNSDmhWZIDFMicC*SDCJws`ldp+G?*8HM4AYVkVy}krFw(e|n0{Rx$)Qt58M_m& z(Pq=gcpj6n$7XECzK)DndFglCW-ysIBK$S6IGr9v+E7a zr%FzJKr){ujyI!E7spD-+B3wVR=Rwq*#D5zfLY>b5S}eg)Da4E#NmU4{akS*kD)YA z9CCL+=ZoVdbjkv8^b)OFDE7ag%8SI|!!&WRICcT=FA>L2GsKp{G^LY>T_(Bj0-d#7 z9BQU-SBS#_I%1`9DNR`=_FDJScJYkUlstP@9a*mu3@Wdz9vakvDJ zZxs6%vDqX}x~p`X#i0P@Y!UaKXA<0M_O-Zqn>bd^47WX_Gm3YJLk&!(JH?S4>bXmt zs=%EAak7o!kt_Btav3OJ9NSM;+XLgO4tk|PvcH=u6dJoxRwPbU5+wU%ej9HVOOCcs z;sMED5y>TzJ@?pLspPBptW0twmryt&j=RT*%f+Ea4pbqIy6euB;&37DI03uJ#06Dm z-pm-NHj`44SB*ICE+(B4M;mF=X>sx>F0O?s<~F<5Ne)%8KByNb+yih8Vy~6_c0rsv zLhXZMzl8zOWHwbyZ!H-nVYQh}G5PSaICP2Hw~Ld3z2tjw?>QWGO&rN3Dz3vWU}<=# z#|X++~L9mK<^~XWTSyA;aDhdkwqE_@=vJdQ5kj`kpwIPfP9_ml2~6 z#L-6f^_0rFl-oH|CZJcL~X@zZ0;N%wr% z6LI_^t$Hf<3+c{h#vikEd2W1`dHRJoxtmP=QrvgOC46zhT?ESd-|39+HR7~vajb!U z7%28@>E~C(@k**c$ZXD0<-ulCMFJXP>@Lv`6^8;eF7B--5sVUt-6`_uj18l3j5ykiK2{v7!9C-|Nq5P8yxF@) zBPW=Cl$F#()2kUZlVGZIn|_`wIpQ7=m?DmSfj(92yE_BZOfO&*OgH|F7S1rPBrnZ` zDYApoXGxClBSvP6y&xuY#32__bH&jn4mD5g|C-R8FHYBAMJiLAsax?ZF1}z1ZJhNWK?)?#;3iapE|sx766(Ybt{| zQaefKh-ANjiML!FzsT6AkjZT_V5Q_xkPbf~j<%4%tHi!@N440if@{PP_w>#waV$s> zofb!%83MK9_+hwC?78Jiy|H^fvO(-Sw_Y&&&zKd08J#*biBpH@@fKtEbZwhB6rg7> zi^GT93SaD>p-(!D_Y(=%jGKtA>t=tLA>3(PKw7vVPSn!IE^+KAb?$~;zY&Z#C5KvR z)-B1M42L_Cqi&As5k~?pKF`j zs5_K1L~K!T$yC-g5mB|f!KTLAOJ)r)YIO>*)!^M6rAvMCdg-L#-IB|v|UpIE| zkc|>Y+vvE_;!q>uHAbA=OLfMI{oUA%6Nk$fx8q?quxRZB;g^l%ds&29n7SsJwR@;? 
zvN)bkP)`x}ecD38hbiSZjNoaK{YrM7E{>JsvKcbzAi2#n*}bhiOB@d1z1iYO9$`O6 z?7J%%bH#~PX196b)Decsd~rA*|1A*5&XH&qij$w=gGJ(aAvIoX?4IUYB2JYP;7i4^ z27I$j9CfcDEjOE!_-}>St0ojyn*BwBeU&&~>=M4|pHb&E#wA3^TH`WK57rrffbZ9f zLvAUxL7a5`xzYFtfxk)Y1sU?2#eNm_*<$SO*K8Gs3yG<1rn{$^wu`-LdT56@*~~!M z3A-reP`iZfB&L8YLd|qyuGrtt^p-D9xJRh>h-2<7v%u`#a4t0a4@p2p;-p&?>=UQz z@mjGs_9@YOKpd}SsFsNRMtY@G9IBv4%fyM3B)%hN?{2`9oBdIGu0q_m8;?}NwCZ;> z@Py<<8?#82mi}xrdCp#mN(d(M=df{g$TRk{oJbg1960&ys<9#PM=MgaUc^2je$;Z5lrQ!!&_gM+KTZ=L ziv8W_kB!|UX-~v4H&UL8J$G01ndyZz_qjOQNNm0^eIIfCQXD@?H-0Df-R-Qb|C3Jo zA@|I4wm9Nm;2&t5L$-NE?46@~2Z`fuVjXPyc}DjTarhvU?@)29fcCy>`umhKOzgi; zHhWDRYNGzbO|PSz5yo!8HBuaPk6gYkesqaY8YND+Q@+t+?*i=_BaWS;kH(sQo@0$O z`!n?8c(eJCewiTlFERa26vrFz;UuxwO2|ykF!h-t4!H-4r<#7A<4zO%wG5Z(;#eKy zdxki9h&gwrINDrA<~RF`)ntCNZ=pVO#EB2+i@9cVmbTB!Fm;=6_IvT$0&)B>bz3M- z9%5;*$ZXu(nv2E$ZsKZ*v3vArsn|P(&zHdri`#VXa^e4E{#aoa)wE=#*l%PmS!KGr z-M3mCYNhlw;)Kh^Yhg<4{D`zKyvs4yo3(pGXoEPMkM%~e-@^FYB#t*TlWZ2pT>G}b zlz8JalD=f`Q&vRV%)EpOY!`o9MwZ(l4wuulJH?6fxPF&7R!S-kh$A_0t~h>@VVp1a zs%h*VaiZyS(!MxZO~VVt(Pl!VNF3fxm+TXJHH3b#*#`)h1L9B~`<96PTJ|j!$BG!z zWv07!9WicWn3anY?i92_+`BDrNIEp*Twal$<^*dtCJrkH!;sC$3!zSuuQLmrsTQS3djS4FN2o6Q~uP*j{c zNR4A=?}8*D?yDold(Gwu!JHC@%gOH##W8n7`>}C7dEtpT-bOG#g2hedEt?ch)nNf-kc{Ubc!IXMheAC*7ISE8@f%^g-fKBMuxa z_Uozq5OJS-3u~y^G}D8xiladybC}r|FsHv}+(b$lE{>Hm35|eh)Lnvcq~u5rzJJ}! 
z8|kZ2X6|17A1(7Q8Ixno+&xx4RvdP>J;sTn?h_*8#s2%m{RDBSg)W&WPSh~|CyB%E zvF^#nxzuusIDUbaOcnRKE}SL~1>oso?^E`j0n?JN84feeq>VH(OB}AC+h&XX^dp+$ zlzTmYt~laeGMxug%vCynzT{{VxpaZp|EQ3JZ(PMFTqI7Mp*D-f$s+{j5^--`ISC)8 z$lK)7Ws>~@D!kmdkYZNIYxMH z3-2>bH<-11JZ_`0yH2x7?C)pXZ5H?antE-4od?);tMFe*E8Aq@oh1sk8@r1}JB(`y zrk!HH3Y%Txc>3u~apE&ctGq`!6o8v-P2K}V!sp*mx*JgxbKKK=@xM1;=Yq4oC=sKbXAb-C5PNM zXHJOYZTP85oVY}IREz!Nw4?@R*RP0_Q9`U#N%%k_j zUJWVuzBuMyZhv6*0qi{(citnu!)9L3u2C~NTtK!L`#Cf#VK(lpt=ISh10p33wK57H zn*AQ!`PjHApKLFVH_;VOVGi3t3_g>b_?%(*T=G>y@P)983cQp>sF?u&PV9eyKeGNU zo#B&}VWq!T235yh(42k#XF;Y&lk( zxI~&ACr-Jwz<9COh6g5yL+<6OiDKV99XLrGcG-HeINFFlMI5W33#N+WXW4g}IB|)F zOgEcfQ->L*e?qj*6!)HEyw4Ixa;V#EvA-YtIWP^p!62C{Ioiy?n`b5;F(J(t$I8AS z)r-9bW`l)hvm5^{GOi^$78@53=u6E00x_}F_#y*lnKNsD)Iw19nZJSzi1-t;r0BD(G|w-K3k>Oa|7svIMkd+!WT!}BN5f&XcMEk zM(ppW%u~khGUsV=-zn^CjonmNC-&T(f_iZ>K;0U|sbiFLA!9@Q2E|eLshuWq>>zD# z5hrT!TbnrP9+|%^?saDd?c!7gz1t!7-NPo=#IXW0?{#tCDaz~=N8LNfH^d?L%te>k zySJ&k#or#J{x?nkfwz21fGh0 zw?KU+j=Q&Bp8wmmVee-T{;N0m^_ic){T4sJ$3f^sed}HKgW7NX`ILWpYs%!wZ@$H^ zrEdAhUu>KFm;C+3TkP_SfBxx@|L(1~woMt4Jv#7Wz~rnM1Gm1ted|veU;g*MSsutg z^Va6e`G5T{Sy{9HCNN-e!GP@I0p94c0Rt)ryjnA0K;3`=!2ts<56HScU_keP0fFoR z1=$0Nv$M*w2b{>xx}2SLJ$pcRc6Lwp|IPdj2xtGn=y1;aqkFUeU?#uEGV4k9_h!B~ z>qYj!nQy!|EBAYUFmv Date: Sat, 16 Mar 2024 12:32:35 -0500 Subject: [PATCH 127/268] Add `generate_config_cpu_optimizer.sh` + Update `ALCF/helpers.sh` to toggle between --- ALCF/helpers.sh | 8 +- generate_config_cpu_optimizer.sh | 151 +++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 generate_config_cpu_optimizer.sh diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index c328f0ec63..be3764450c 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -165,7 +165,13 @@ buildDSconfig() { echo "DS_CONFIG: ${DS_CONFIG}" printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" ${ZERO_STAGE} ${MICRO_BATCH} ${GLOBAL_BATCH} ${PP} ${DTYPE} # generateConfig "${DS_CONFIG}" - bash "${PBS_O_WORKDIR}/generate_config.sh" 
"${DS_CONFIG}" #|| exit 1 + use_cpu_opt=$1 + if [[ $use_cpu_opt ]]; then + echo "!!! Using CPU Optimizer !!!" + bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" + else + bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 + fi # ------------------------------------------------------------- } diff --git a/generate_config_cpu_optimizer.sh b/generate_config_cpu_optimizer.sh new file mode 100644 index 0000000000..99dec97958 --- /dev/null +++ b/generate_config_cpu_optimizer.sh @@ -0,0 +1,151 @@ +#!/bin/bash --login + +for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ + "$PP" "$DTYPE" +do + if [ -z $v ]; then + echo "Please export required envs before execute $0" + exit 1 + fi +done + +if [ $# -ne 1 ]; then + echo "Usage: $0 config_file" + exit 1 +fi + +extra="" +common="\ + \"train_batch_size\": $GLOBAL_BATCH, + \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, + \"steps_per_print\": 1, + \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, + \"optimizer\": { + \"type\": \"AdamW\", + \"params\": { + \"lr\": ${LR}, + \"beta1\": 0.9, + \"beta2\": 0.95, + \"eps\": 1e-5, + \"weight_decay\": 1e-1 + } + }, + \"scheduler\": { + \"type\": \"WarmupLR\", + \"params\": { + \"warmup_min_lr\": 0.00003, + \"warmup_max_lr\": 0.0003, + \"warmup_num_steps\": 5000 + } + }, + \"zero_allow_untested_optimizer\": true, + \"gradient_clipping\": 1.0, + \"activation_checkpointing\": { + \"partition_activations\": true, + \"contiguous_memory_optimization\": false + }, + \"wall_clock_breakdown\": false," + +flops_profiler="\ + \"flops_profiler\": { + \"enabled\": false, + \"profile_step\": 45, + \"module_depth\": -1, + \"top_modules\": 1, + \"detailed\": true, + \"output_file\": null + }" + +if [[ $DTYPE == "bf16" ]]; then +dtype="\ + \"communication_data_type\": \"bfp16\", + \"fp16\": { + \"enabled\": false, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + 
\"enabled\": true, + \"loss_scale\": 1.0 + }," +else +dtype="\ + \"communication_data_type\": \"fp16\", + \"fp16\": { + \"enabled\": true, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": false, + \"loss_scale\": 1.0 + }," +fi + +if [ $ZERO_STAGE == 3 ]; then +zero="\ + \"zero_optimization\": { + \"stage\": 3, + \"reduce_scatter\": false, + \"stage3_max_live_parameters\": 3e9, + \"stage3_max_reuse_distance\": 3e9, + \"stage3_param_persistence_threshold\": 1e5, + \"stage3_prefetch_bucket_size\": 5e7, + \"contiguous_gradients\": true, + \"overlap_comm\": true, + \"reduce_bucket_size\": 90000000, + \"sub_group_size\": 1e9, + \"offload_optimizer\": { + \"device\": \"none\", + \"buffer_count\": 4, + \"pipeline_read\": false, + \"pipeline_write\": false, + \"pin_memory\": true + } + }," +elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then +zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE, + \"offload_optimizer\": { + \"device\": \"cpu\", + \"buffer_count\": 4, + \"pipeline_read\": false, + \"pipeline_write\": false, + \"pin_memory\": true + } + }," + if [ $ZERO_STAGE == 1 ]; then + if [ $PP > 1 ]; then + extra="\ + \"data_types\": { + \"grad_accum_dtype\": \"fp32\" + }, + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," + else + echo 'please add the config for zero_stage 1 without pipeline-parallelism' + fi + fi +else + echo 'Please add the correct config set!!!' 
+fi + +# flops_profiler must at the end because no ',' is allowed at the end +cat < $1 +{ +$common +$zero +$dtype +$extra +$flops_profiler +} +EOT From 20492989b48a16855c3b4cbc4b9b522ee9a960eb Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 16 Mar 2024 12:52:17 -0500 Subject: [PATCH 128/268] Catch `CPU_OPTIMIZER` in `ALCF/helpers.sh` @ `setParams()` --- ALCF/helpers.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index be3764450c..c0c68b3537 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -83,6 +83,10 @@ setParams() { export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" + if [[ "${CPU_OPTIMIZER:-0}" ]]; then + echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." + export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" + fi # ---------------------------------------------------- } From f44f61afd2d11fd8c340139c6db71894ca4b5d01 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 16 Mar 2024 12:52:49 -0500 Subject: [PATCH 129/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 8b3edf41f2..915b5435c8 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -32,7 +32,7 @@ saveDSenv || exit # 2. save env vars to `.deepspeed_env` ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. 
create `deepspeed_config.json` from runtime params from ^ +buildDSconfig "${CPU_OPTIMIZER:-0}" || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset @@ -46,8 +46,13 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit +# echo "Using $(which python3)" +# --launcher_args='--pmi=pmix' + # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + # ${launch_cmd} \ run_cmd=" - deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --use-flash-attn-v2 \ --$DTYPE \ --num-workers 0 \ @@ -69,7 +74,7 @@ run_cmd=" --hidden-size ${HIDDEN} \ --train-iters ${TRAIN_ITER} \ --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${NCCL} \ + --distributed-backend ${BE} \ --num-attention-heads ${HEADS} \ --save-interval ${SAVE_INTERVAL} \ --eval-interval ${EVAL_INTERVAL} \ @@ -90,24 +95,6 @@ run_cmd=" |& tee ${OUTPUT_LOG} " - # --------------------------------------------------- - # --vocab-file $VOCAB_FILE \ - # --merge-file $MERGE_FILE \ - # --lr-decay-iters 320000 \ - # --lr-warmup-iters 5000 \ - # --lr-decay-iters 10000 \ - # --num-workers 4 \ - # launch python3 ${EXEC} \ - # --data-impl mmap \ - # source ./ezpz/src/ezpz/bin/getjobenv || exit - # --------------------------------------------------- - # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ - # ${DIST_LAUNCH} python3 ${EXEC} \ - # deepspeed $launcher ${EXEC} \ - # >> ${OUTPUT_LOG} 2>&1 & - # >> ${OUTPUT_LOG} 2>&1 & - # |& tee $OUTPUT_DIR/output.log - # ${EXTRA_ARGS} \ echo "All DeepSpeed(s): $(which -a deepspeed)" echo "Using $(which deepspeed)" From 
0527c71d7eccf3b9c9dddafd0c4c0d48b6ae4eef Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 18 Mar 2024 07:12:24 -0500 Subject: [PATCH 130/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index c0c68b3537..bc2adb26fa 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -40,12 +40,15 @@ setParams() { export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 + echo "!!!! Using CPU_OPTIMIZER on Intel XPU by Default !!!!" + export CPU_OPTIMIZER=${CPU_OPTIMIZER:-1} # CPU OPTIMIZER ON INTEL XPU # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 PP=${PP:-1} # PP = 1 export NCCL=${NCCL:-nccl} # NCCL export BE="${NCCL}" # BE = NCCL + # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 fi @@ -83,7 +86,8 @@ setParams() { export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - if [[ "${CPU_OPTIMIZER:-0}" ]]; then + # if [[ "${CPU_OPTIMIZER:-0}" ]]; then + if [[ -n "${CPU_OPTIMIZER}" ]]; then echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." 
export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" fi @@ -168,13 +172,11 @@ buildDSconfig() { export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" echo "DS_CONFIG: ${DS_CONFIG}" printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" ${ZERO_STAGE} ${MICRO_BATCH} ${GLOBAL_BATCH} ${PP} ${DTYPE} - # generateConfig "${DS_CONFIG}" - use_cpu_opt=$1 - if [[ $use_cpu_opt ]]; then + if [[ -z "${CPU_OPTIMIZER}" ]]; then + bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 + else echo "!!! Using CPU Optimizer !!!" bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" - else - bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 fi # ------------------------------------------------------------- } From cb54fed036ef153cbfbdc018be797f67f78df8a3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 18 Mar 2024 07:12:44 -0500 Subject: [PATCH 131/268] Update `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 915b5435c8..2e1a23010c 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -32,7 +32,7 @@ saveDSenv || exit # 2. save env vars to `.deepspeed_env` ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig "${CPU_OPTIMIZER:-0}" || exit # 6. create `deepspeed_config.json` from runtime params from ^ +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments setData "${DATA_FILE_LIST}"|| exit # 9. 
specify `DATA_FILE_LIST` for dolma dataset From b4b101d37f30c25d863d0562dcbd9748a055a1a4 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 18 Mar 2024 09:59:49 -0500 Subject: [PATCH 132/268] Update `train_llama_alcf_sunspot.sh` --- train_llama_alcf_sunspot.sh | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh index d4cec63e33..d5e83c57a0 100644 --- a/train_llama_alcf_sunspot.sh +++ b/train_llama_alcf_sunspot.sh @@ -45,6 +45,10 @@ module () { # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" # conda activate q4-drop +if [[ $(hostname) == x1* || $(hostname) == x4* ]] ; then + echo "!!!! Caught Intel XPU, using CPU_OPTIMIZER !!!!" + export CPU_OPTIMIZER=1; +fi # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -52,27 +56,24 @@ module () { cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') export HERE - -# PARENT="$(dirname "${HERE}")" -# source "${PARENT}/setenv.sh" || exit # ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- export EXEC="${HERE}/pretrain_gpt_alcf.py" [ -f "${EXEC}" ] || exit # ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ sourceFile "${HERE}/ALCF/helpers.sh" || exit -# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ -setEnv || exit # 1. load `conda` environment -saveDSenv || exit # 2. save env vars to `.deepspeed_env` -ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ -setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} -setArgs || exit # 8. 
specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset -setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` -printJobInfo || exit # 11. print job info -# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------------------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. 
print job info +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # Take custom args custom_args=" $@" @@ -95,7 +96,6 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --$DTYPE \ - --cpu-optimizer \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ From 315382519de59b243f9ece73e5cc2a94f9886453 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 18 Mar 2024 14:58:52 -0500 Subject: [PATCH 133/268] Update README.md --- ALCF/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ALCF/README.md b/ALCF/README.md index 49b583aa24..1a8612ed8a 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -98,6 +98,7 @@ ```bash $ conda activate "${DAY}" # e.g. 2024-03-07 $ conda install -c pytorch -c nvidia --solver libmamba mpi4py ninja transformers xformers triton pytorch torchvision torchaudio pytorch-cuda=11.8 + $ conda install --solver libmamba mpi4py -c conda-forge -c pytorch -c nvidia $ python3 -m pip install --upgrade pip pybind11 toolong appdirs wandb sentencepiece ipython setuptools wheel ninja $ python3 -m pip install --upgrade deepspeed wandb ``` From 2d46039a7d58188bb66816030c60f2f9ef389fc6 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 22 Mar 2024 10:47:41 -0500 Subject: [PATCH 134/268] Add `--train-iters-to-skip` option for skipping backprop on certain train iters --- megatron/arguments.py | 8 +++++--- megatron/core/pipeline_parallel/schedules.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d83fe99856..3b83d16299 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -959,6 +959,11 @@ def _add_training_args(parser): group.add_argument('--use-dataset-only', type=bool, required=False, default=False, help='If set to True, only use the megatron dataset for external trainer ') group.add_argument('--profile', 
action='store_true', help='Enable Torch Profiler') + group.add_argument('--train-iters-to-skip', action="extend", nargs="+", type=str, + help=( + "Specific train iterations to skip when training. " + "Load the data and just perform a noop." + )) return parser @@ -1510,7 +1515,6 @@ def _add_activation_checkpoint_args(parser): def _add_distillation_args(parser): group = parser.add_argument_group('Knowledge distillation', 'Distillation Configurations') - group.add_argument('--num-layers-teacher', type=int, default=None, help='Number of the teacher transformer layers.') group.add_argument('--num-experts-teacher', type=int, nargs='+', default=[1,], @@ -1519,7 +1523,6 @@ def _add_distillation_args(parser): help='Tansformer teacher hidden size.') group.add_argument('--num-attention-heads-teacher', type=int, default=None, help='Number of teacher transformer attention heads.') - group.add_argument('--mos', action='store_true', help='Enable Mixture-of-Students via knolwedge distillation.') group.add_argument('--kd', action='store_true', @@ -1529,7 +1532,6 @@ def _add_distillation_args(parser): group.add_argument('--kd-temp', default=1.0, type=float) group.add_argument('--reset-iteration', action='store_true', help='Reset the iteration count.') - group.add_argument('--load-teacher', type=str, default=None, help='Directory containing a teacher model checkpoint.') diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 407bb16d56..bb876481ac 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -241,6 +241,17 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c # needs to be modified slightly to support arbitrary numbers of skip # connections. 
args = get_args() + assert args is not None + if config.timers is not None: + config.timers('backward-compute', log_level=2).start() + # if (to_skip := args.train_iters_to_skip) is not None and len(to_skip) > 0: + to_skip = getattr(args, 'train_iters_to_skip', None) + if to_skip is not None: + if config.timers is not None: + config.timers('backward-compute').stop() + if args.iteration in [int(i) for i in to_skip]: + print(f'Caught {args.iteration=} in `iters_to_skip`! Skipping!') + return [None] if args.deepspeed: assert model is not None From d96a58535327ac0ef3bdead29ffd833a0661a631 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 22 Mar 2024 12:23:08 -0500 Subject: [PATCH 135/268] [fix] `megatron/training.py` Fix bug when passing `--train-iters-to-skip` when running **without** setting `CPU_OPTIMIZER=1` --- megatron/training.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 7e6c7dc6bb..e987eb158d 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -728,8 +728,11 @@ def train_step(forward_step_func, data_iterator, increment = get_num_microbatches() * \ args.micro_batch_size * \ args.data_parallel_size - model[0].step(lr_kwargs={'increment': increment}) - update_successful = model[0].was_step_applied() + try: + model[0].step(lr_kwargs={'increment': increment}) + update_successful = model[0].was_step_applied() + except Exception: + update_successful = False else: update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) timers('optimizer').stop() From 038109af825d62029f00c9717a2948b2adafa362 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 25 Mar 2024 20:33:52 -0500 Subject: [PATCH 136/268] Add `--optimizer adamw` for `torch.optim.AdamW` --- megatron/arguments.py | 4 +-- megatron/optimizer/__init__.py | 50 ++++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py 
index 3b83d16299..cd618b285e 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -800,7 +800,7 @@ def _add_training_args(parser): ' ' ' ' 'For example:' - ' --rampup-batch-size 16 8 300000 \ ' + ' --rampup-batch-size 16 8 300000 \\ ' ' --global-batch-size 1024' 'will start with global batch size 16 and over ' ' (1024 - 16) / 8 = 126 intervals will increase' @@ -914,7 +914,7 @@ def _add_training_args(parser): help='Disable bias in the linear layers', dest='add_bias_linear') group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'sgd'], + choices=['adam', 'adamw', 'sgd'], help='Optimizer function') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 12a458375d..e7400c39b5 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,12 +1,13 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': - from apex.optimizers import FusedAdam as Adam - from apex.optimizers import FusedSGD as SGD -else: - from torch.optim import Adam - from torch.optim import SGD +# if get_accelerator().device_name() == 'cuda': +# from apex.optimizers import FusedAdam as Adam +# from apex.optimizers import FusedSGD as SGD +# else: +# from torch.optim import Adam +# from torch.optim import SGD +import torch from megatron import get_args @@ -93,24 +94,37 @@ def get_megatron_optimizer(model, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) else: - if args.optimizer == 'adam': + if str(args.optimizer).lower() == 'adamw': + optimizer = torch.optim.AdamW( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'adam': if args.ds_fused_adam: global Adam from deepspeed.ops.adam import FusedAdam Adam = FusedAdam - optimizer = Adam(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) + else: + Adam = torch.optim.Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) + optimizer = torch.optim.SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) else: - raise Exception('{} optimizer is not supported.'.format( - args.optimizer)) + raise Exception(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer From 66401b8e197dbcc649f779119e5cb8f2303ea929 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 08:42:02 -0500 Subject: [PATCH 137/268] Update `pretrain_gpt_alcf.py` --- 
pretrain_gpt_alcf.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 4fefef795f..e296a637be 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -153,12 +153,14 @@ def model_provider(pre_process=True, post_process=True): see_memory_usage("After Building Model", force=True) if wandb.run is not None: wandb.run.config.update({'num_params': num_params}) - # wandb.run.watch( - # model, - # log='all', - # log_graph=True, - # ) - # wandb.run.config.update({'num_params': num_params}) + try: + wandb.run.watch( + model, + log='all', + log_graph=True, + ) + except Exception: + pass return model From c87ea6aafe82924e4b9b44c066ebcd238572d722 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:22:50 -0500 Subject: [PATCH 138/268] Remove redundant `generate_config_cpu_optimizer.sh` --- generate_config_cpu_optimizer.sh | 151 ------------------------------- 1 file changed, 151 deletions(-) delete mode 100644 generate_config_cpu_optimizer.sh diff --git a/generate_config_cpu_optimizer.sh b/generate_config_cpu_optimizer.sh deleted file mode 100644 index 99dec97958..0000000000 --- a/generate_config_cpu_optimizer.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash --login - -for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ - "$PP" "$DTYPE" -do - if [ -z $v ]; then - echo "Please export required envs before execute $0" - exit 1 - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 config_file" - exit 1 -fi - -extra="" -common="\ - \"train_batch_size\": $GLOBAL_BATCH, - \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, - \"steps_per_print\": 1, - \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"optimizer\": { - \"type\": \"AdamW\", - \"params\": { - \"lr\": ${LR}, - \"beta1\": 0.9, - \"beta2\": 0.95, - \"eps\": 1e-5, - \"weight_decay\": 1e-1 - } - }, - \"scheduler\": { - \"type\": \"WarmupLR\", - \"params\": { - \"warmup_min_lr\": 0.00003, - 
\"warmup_max_lr\": 0.0003, - \"warmup_num_steps\": 5000 - } - }, - \"zero_allow_untested_optimizer\": true, - \"gradient_clipping\": 1.0, - \"activation_checkpointing\": { - \"partition_activations\": true, - \"contiguous_memory_optimization\": false - }, - \"wall_clock_breakdown\": false," - -flops_profiler="\ - \"flops_profiler\": { - \"enabled\": false, - \"profile_step\": 45, - \"module_depth\": -1, - \"top_modules\": 1, - \"detailed\": true, - \"output_file\": null - }" - -if [[ $DTYPE == "bf16" ]]; then -dtype="\ - \"communication_data_type\": \"bfp16\", - \"fp16\": { - \"enabled\": false, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": true, - \"loss_scale\": 1.0 - }," -else -dtype="\ - \"communication_data_type\": \"fp16\", - \"fp16\": { - \"enabled\": true, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": false, - \"loss_scale\": 1.0 - }," -fi - -if [ $ZERO_STAGE == 3 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": 3, - \"reduce_scatter\": false, - \"stage3_max_live_parameters\": 3e9, - \"stage3_max_reuse_distance\": 3e9, - \"stage3_param_persistence_threshold\": 1e5, - \"stage3_prefetch_bucket_size\": 5e7, - \"contiguous_gradients\": true, - \"overlap_comm\": true, - \"reduce_bucket_size\": 90000000, - \"sub_group_size\": 1e9, - \"offload_optimizer\": { - \"device\": \"none\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," -elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": $ZERO_STAGE, - \"offload_optimizer\": { - \"device\": \"cpu\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," - if [ $ZERO_STAGE == 1 ]; then - if [ $PP > 1 ]; then - extra="\ - \"data_types\": { - \"grad_accum_dtype\": 
\"fp32\" - }, - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," - else - echo 'please add the config for zero_stage 1 without pipeline-parallelism' - fi - fi -else - echo 'Please add the correct config set!!!' -fi - -# flops_profiler must at the end because no ',' is allowed at the end -cat < $1 -{ -$common -$zero -$dtype -$extra -$flops_profiler -} -EOT From 37f6d3a757e7393751cdbaa48bea6e4754024a9e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:23:08 -0500 Subject: [PATCH 139/268] Update `train_llama_alcf_sunspot.sh` --- train_llama_alcf_sunspot.sh | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh index d5e83c57a0..f000506bb9 100644 --- a/train_llama_alcf_sunspot.sh +++ b/train_llama_alcf_sunspot.sh @@ -41,15 +41,6 @@ module () { return $__lmod_my_status } -# -# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" -# conda activate q4-drop - -if [[ $(hostname) == x1* || $(hostname) == x4* ]] ; then - echo "!!!! Caught Intel XPU, using CPU_OPTIMIZER !!!!" - export CPU_OPTIMIZER=1; -fi - # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- @@ -67,7 +58,7 @@ saveDSenv || exit # 2. save env vars to `.deepspeed_ ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments setData "${DATA_FILE_LIST}"|| exit # 9. 
specify `DATA_FILE_LIST` for dolma dataset @@ -81,21 +72,15 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit -# hf="${HOSTFILE:-${PBS_NODEFILE}}" -# nh=$(wc -l "${hf}") -# if [[ "${nh}" -gt 1 ]]; then -# launch_cmd="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" -# else -# launch_cmd="python3 ${EXEC}" -# fi -# -# echo "launch_cmd: ${launch_cmd}" - # --use-flash-attn-v2 \ - # python3 ${EXEC} \ + # --use-flash-attn \ + # --$DTYPE \ + # --optimizer adamw \ + # --adam-beta1 0.9 \ + # --adam-beta2 0.95 \ run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --$DTYPE \ + --optimizer ${OPT} \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ @@ -108,9 +93,9 @@ run_cmd=" --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ --lr ${LR} \ - --seq-length $SEQ \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ --num-layers ${NLAYERS} \ --hidden-size ${HIDDEN} \ --train-iters ${TRAIN_ITER} \ @@ -155,12 +140,10 @@ run_cmd=" # |& tee $OUTPUT_DIR/output.log # ${EXTRA_ARGS} \ -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" +echo "! Using $(which deepspeed)" ds_report echo "${run_cmd}" - printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" # echo "${OUTPUT_LOG}" From 7befd20801d4f62e9e217636a9e1ed3e73c13f71 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:24:03 -0500 Subject: [PATCH 140/268] Update `megatron/core/pipeline_parallel/schedules.py` --- megatron/core/pipeline_parallel/schedules.py | 32 +++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index bb876481ac..453019f8c9 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -14,7 +14,7 @@ from megatron.core.enums import ModelType from megatron.core.utils import get_attr_wrapped_model, get_model_type, get_model_config -from megatron.utils import unwrap_model +from megatron.utils import print_rank_0, unwrap_model from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module @@ -228,7 +228,14 @@ def forward_step(forward_step_func, return [output_tensor] -def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model=None): +def backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + model_type, + config, + model=None +): """Backward step through passed-in output tensor. 
If last stage, output_tensor_grad is None, otherwise gradient of loss @@ -244,20 +251,16 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c assert args is not None if config.timers is not None: config.timers('backward-compute', log_level=2).start() - # if (to_skip := args.train_iters_to_skip) is not None and len(to_skip) > 0: - to_skip = getattr(args, 'train_iters_to_skip', None) - if to_skip is not None: + if (to_skip := getattr(args, 'train_iters_to_skip', None)) is not None: if config.timers is not None: config.timers('backward-compute').stop() - if args.iteration in [int(i) for i in to_skip]: - print(f'Caught {args.iteration=} in `iters_to_skip`! Skipping!') + if len(to_skip) > 0 and args.iteration in [int(i) for i in to_skip]: + print_rank_0( + f'Caught {args.iteration=} in `iters_to_skip`! Skipping!' + ) return [None] if args.deepspeed: assert model is not None - - if config.timers is not None: - config.timers('backward-compute', log_level=2).start() - # Retain the grad on the input_tensor. unwrap_input_tensor_grad = False if not isinstance(input_tensor, list): @@ -266,24 +269,20 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c for x in input_tensor: if x is not None: x.retain_grad() - if not isinstance(output_tensor, list): output_tensor = [output_tensor] if not isinstance(output_tensor_grad, list): output_tensor_grad = [output_tensor_grad] - # Backward pass. if args.deepspeed: model.backward(output_tensor[0]) else: if output_tensor_grad[0] is None and config.grad_scale_func is not None: output_tensor[0] = config.grad_scale_func(output_tensor[0]) - if config.deallocate_pipeline_outputs: custom_backward(output_tensor[0], output_tensor_grad[0]) else: torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) - # Collect the grad of the input_tensor. 
input_tensor_grad = [None] if input_tensor is not None: @@ -293,7 +292,6 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad.append(None) else: input_tensor_grad.append(x.grad) - # Handle single skip connection if it exists (encoder_hidden_state in # model with encoder and decoder). if parallel_state.get_pipeline_model_parallel_world_size() > 1 and \ @@ -303,10 +301,8 @@ def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, c input_tensor_grad[-1].add_(output_tensor_grad[1]) if unwrap_input_tensor_grad: input_tensor_grad = input_tensor_grad[0] - if config.timers is not None: config.timers('backward-compute').stop() - return input_tensor_grad From 25353fda499d23bd3d6b42986cbf0f5353537eab Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:24:30 -0500 Subject: [PATCH 141/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index bc2adb26fa..30d9add286 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -40,8 +40,11 @@ setParams() { export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 - echo "!!!! Using CPU_OPTIMIZER on Intel XPU by Default !!!!" - export CPU_OPTIMIZER=${CPU_OPTIMIZER:-1} # CPU OPTIMIZER ON INTEL XPU + # if [[ -z "${CPU_OPTIMIZER}" ]]; then + # CPU_OPTIMIZER=1 + # fi + # echo "!!!! Using CPU_OPTIMIZER on Intel XPU by Default !!!!" 
+ # export CPU_OPTIMIZER=${CPU_OPTIMIZER:-1} # CPU OPTIMIZER ON INTEL XPU # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 @@ -53,8 +56,10 @@ setParams() { MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 fi # ------------------------------------------------------------------------ + # export OFFLOAD_DEVICE="${OFFLOAD_DEVICE:-none}" export PP="${PP}" export TP="${TP}" + export OPT="${OPT:-adamw}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} # ---- Llama2 7B Config ------------------------------ @@ -87,9 +92,13 @@ setParams() { export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # if [[ "${CPU_OPTIMIZER:-0}" ]]; then - if [[ -n "${CPU_OPTIMIZER}" ]]; then + # if [[ -n "${CPU_OPTIMIZER}" ]]; then + if [[ "${CPU_OPTIMIZER}" == 1 ]]; then + export OFFLOAD_DEVICE="cpu" echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" + else + export OFFLOAD_DEVICE="none" fi # ---------------------------------------------------- } @@ -171,13 +180,15 @@ buildDSconfig() { # ---- Build DeepSpeed Config --------------------------------- export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" echo "DS_CONFIG: ${DS_CONFIG}" - printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" ${ZERO_STAGE} ${MICRO_BATCH} ${GLOBAL_BATCH} ${PP} ${DTYPE} - if [[ -z "${CPU_OPTIMIZER}" ]]; then - bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 - else - echo "!!! Using CPU Optimizer !!!" 
- bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" - fi + printf "ZS: %s, CPU_OPTIMIZER: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" + bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" + # if [[ -z "${CPU_OPTIMIZER}" ]]; then + # echo "!!! Using GPU Optimizer !!!" + # bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 + # else + # echo "!!! Using CPU Optimizer !!!" + # bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" + # fi # ------------------------------------------------------------- } From 29f1e307095188a750da7ba3f97d1697055f2809 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:24:46 -0500 Subject: [PATCH 142/268] Update `generate_config.sh` --- generate_config.sh | 75 +++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/generate_config.sh b/generate_config.sh index 6bea420a2a..64f0161087 100644 --- a/generate_config.sh +++ b/generate_config.sh @@ -14,22 +14,23 @@ if [ $# -ne 1 ]; then exit 1 fi +# \"optimizer\": { +# \"type\": \"AdamW\", +# \"params\": { +# \"lr\": ${LR}, +# \"beta1\": 0.9, +# \"beta2\": 0.95, +# \"eps\": 1e-5, +# \"weight_decay\": 1e-1 +# } +# }, + extra="" common="\ \"train_batch_size\": $GLOBAL_BATCH, \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, \"steps_per_print\": 1, \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"optimizer\": { - \"type\": \"AdamW\", - \"params\": { - \"lr\": ${LR}, - \"beta1\": 0.9, - \"beta2\": 0.95, - \"eps\": 1e-5, - \"weight_decay\": 1e-1 - } - }, \"scheduler\": { \"type\": \"WarmupLR\", \"params\": { @@ -70,7 +71,7 @@ dtype="\ \"enabled\": true, \"loss_scale\": 1.0 }," -else +elif [[ $DTYPE == "fp16" ]]; then dtype="\ \"communication_data_type\": \"fp16\", \"fp16\": { @@ -84,6 +85,8 @@ dtype="\ \"enabled\": false, \"loss_scale\": 1.0 }," +else + 
dtype="\"communication_data_type\": \"fp32\"," fi if [ $ZERO_STAGE == 3 ]; then @@ -107,27 +110,43 @@ zero="\ \"pin_memory\": true } }," -elif [ $ZERO_STAGE == 2 ] || [ $ZERO_STAGE == 1 ]; then + +# elif [[ $ZERO_STAGE == 2 ]]; then +elif [ "${ZERO_STAGE}" == 2 ] || [ "${ZERO_STAGE}" == 1 ]; then + +if [[ -n "${CPU_OPTIMIZER}" ]]; then +echo "!!!! CAUGHT CPU_OPTIMIZER !!!!" + +zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE, + \"offload_optimizer\": { + \"device\": \"cpu\" + } + }," + +else zero="\ \"zero_optimization\": { \"stage\": $ZERO_STAGE }," - if [ $ZERO_STAGE == 1 ]; then - if [ $PP > 1 ]; then - extra="\ - \"data_types\": { - \"grad_accum_dtype\": \"fp32\" - }, - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," - else - echo 'please add the config for zero_stage 1 without pipeline-parallelism' - fi - fi +fi + +# elif [[ $ZERO_STAGE == 1 ]]; then +if [[ $PP > 1 ]]; then + extra="\ + \"data_types\": { + \"grad_accum_dtype\": \"fp32\" + }, + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," +else + echo 'please add the config for zero_stage 1 without pipeline-parallelism' +fi else echo 'Please add the correct config set!!!' 
fi From 336332508baeb4e51dc7dfcd480749a13f01b38f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 09:25:37 -0500 Subject: [PATCH 143/268] `assert args is not None` in `megatron/training.py` --- megatron/training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index e987eb158d..f511efc4ff 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -143,6 +143,8 @@ def pretrain(train_valid_test_dataset_provider, args = get_args() timers = get_timers() + assert args is not None + assert timers is not None if args.deepspeed: args.deepspeed_config_dict = _create_ds_config_dict() @@ -319,6 +321,7 @@ def setup_teacher_model(args, model_provider): def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): """Build the model.""" args = get_args() + assert args is not None args.model_type = model_type # Build model. From 0dcba4ff4c8262a0fe30412509a22792896d552f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 26 Mar 2024 11:08:51 -0500 Subject: [PATCH 144/268] Remove `"scheduler": {...}` from `generate_config.sh` --- generate_config.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/generate_config.sh b/generate_config.sh index 64f0161087..e140b7a274 100644 --- a/generate_config.sh +++ b/generate_config.sh @@ -24,6 +24,14 @@ fi # \"weight_decay\": 1e-1 # } # }, +# \"scheduler\": { +# \"type\": \"WarmupLR\", +# \"params\": { +# \"warmup_min_lr\": 0.00003, +# \"warmup_max_lr\": 0.0003, +# \"warmup_num_steps\": 5000 +# } +# }, extra="" common="\ @@ -31,14 +39,6 @@ common="\ \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, \"steps_per_print\": 1, \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"scheduler\": { - \"type\": \"WarmupLR\", - \"params\": { - \"warmup_min_lr\": 0.00003, - \"warmup_max_lr\": 0.0003, - \"warmup_num_steps\": 5000 - } - }, \"zero_allow_untested_optimizer\": true, \"gradient_clipping\": 1.0, 
\"activation_checkpointing\": { From 367e4ae534090dd6f016c123bab9f04db1b32b67 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 09:33:57 -0500 Subject: [PATCH 145/268] Track optimizer states --- train_llama_alcf_polaris.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 2e1a23010c..7cc1454f60 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -51,6 +51,7 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit # --launcher_args='--pmi=pmix' # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ # ${launch_cmd} \ + # --optimizer adam \ run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --use-flash-attn-v2 \ @@ -66,6 +67,7 @@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ + --log-optimizer-states-to-tensorboard \ --lr ${LR} \ --seq-length $SEQ \ --save ${CKPT_DIR} \ From 31644f068b962cf03479d1c8840254921ef240f1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 09:34:20 -0500 Subject: [PATCH 146/268] Track optimizer states with W&B in `megatron/training.py` --- megatron/training.py | 147 +++++++++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 54 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index f511efc4ff..bf87e7b52c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -755,7 +755,6 @@ def train_step(forward_step_func, data_iterator, skipped_iter = 0 grad_norm = None num_zeros_in_grad = None - loss_reduced = {} for key in losses_reduced[0]: losses_reduced_for_key = [x[key] for x in losses_reduced] @@ -793,7 +792,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, args = get_args() timers = get_timers() writer = get_tensorboard_writer() - + wandb_metrics = {} # Advanced, skipped, and Nan iterations. 
advanced_iters_key = 'advanced iterations' skipped_iters_key = 'skipped iterations' @@ -852,11 +851,15 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, 'optimizer'] # Calculate batch size. - batch_size = args.micro_batch_size * args.data_parallel_size * \ - get_num_microbatches() - - total_iterations = total_loss_dict[advanced_iters_key] + \ - total_loss_dict[skipped_iters_key] + batch_size = ( + args.micro_batch_size + * args.data_parallel_size + * get_num_microbatches() + ) + total_iterations = ( + total_loss_dict[advanced_iters_key] + + total_loss_dict[skipped_iters_key] + ) # Tensorboard values. # Timer requires all the ranks to call. @@ -870,6 +873,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, writer.add_scalar('steps-vs-tokens/y=steps,x=tokens', iteration, args.consumed_train_tokens) writer.add_scalar('steps-vs-tokens/y=tokens,x=steps', args.consumed_train_tokens, iteration) if args.log_learning_rate_to_tensorboard: + wandb_metrics |= { + 'learning-rate/iteration': iteration, + 'learning-rate/learning-rate': learning_rate, + } writer.add_scalar('learning-rate/learning-rate', learning_rate, iteration) writer.add_scalar('learning-rate/learning-rate vs samples', learning_rate, args.consumed_train_samples) @@ -881,7 +888,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, args.consumed_train_samples) writer.add_scalar('batch-size/batch-size vs tokens', batch_size, args.consumed_train_tokens) + wandb_metrics |= { + 'lm-loss-training/iteration': iteration, + 'lm-loss-training/consumed_train_tokens': args.consumed_train_tokens, + } for key in loss_dict: + wandb_metrics |= {f'lm-loss-training/{key}': loss_dict[key]} writer.add_scalar(f"lm-loss-training/{key}", loss_dict[key], iteration) writer.add_scalar(f"lm-loss-training/{key}" + ' vs samples', loss_dict[key], args.consumed_train_samples) @@ -900,18 +912,21 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, 
writer.add_scalar('world-size/world-size vs tokens', args.world_size, args.consumed_train_tokens) if grad_norm is not None: + wandb_metrics |= {'training/grad-norm': grad_norm} writer.add_scalar('grad-norm/grad-norm', grad_norm, iteration) writer.add_scalar('grad-norm/grad-norm vs samples', grad_norm, args.consumed_train_samples) writer.add_scalar('grad-norm/grad-norm vs tokens', grad_norm, args.consumed_train_tokens) if num_zeros_in_grad is not None: + wandb_metrics |= {'training/num-zeros': num_zeros_in_grad} writer.add_scalar('num-zeros/num-zeros', num_zeros_in_grad, iteration) writer.add_scalar('num-zeros/num-zeros vs samples', num_zeros_in_grad, args.consumed_train_samples) writer.add_scalar('num-zeros/num-zeros vs tokens', num_zeros_in_grad, args.consumed_train_tokens) if params_norm is not None: + wandb_metrics |= {'training/params-norm': params_norm} writer.add_scalar('params-norm/params-norm', params_norm, iteration) writer.add_scalar('params-norm/params-norm vs samples', params_norm, args.consumed_train_samples) @@ -955,7 +970,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, mem_stats["allocation.all.current"], iteration, ) - if iteration % args.tensorboard_log_interval == 0: # This logging write various optimizer states to tensorboard. This # feature may consume extra GPU memory thus is set at false by default. 
@@ -979,26 +993,49 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if args.zero_stage > 0: # ZeRO partiions optimizer states + # opt_stats = opt_stats.clone().detach() + # opt_stats = get_accelerator().FloatTensor opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_sequence_data_parallel_group()) + # opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_sequence_data_parallel_group()) if args.tensor_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_tensor_model_parallel_group()) if args.pipeline_model_parallel_size > 1: + # opt_stats = opt_stats.clone().detach() opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) + # opt_stats_2 = opt_stats_2.clone().detach() opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_pipeline_model_parallel_group()) - + wandb_metrics |= { + 'optimizer/iteration': args.iteration, + 'optimizer/consumed_train_tokens': args.consumed_train_tokens, + 'optimizer/variance_l2': opt_stats[0]**0.5, + 'optimizer/variance_sqrt_l2': opt_stats[1]**0.5, + 'optimizer/momentum_l2': opt_stats[2]**0.5, + 'optimizer/weight_l2': 
opt_stats[3]**0.5, + 'optimizer/variance_l1': opt_stats[4], + 'optimizer/variance_sqrt_l1': opt_stats[5], + 'optimizer/momentum_l1': opt_stats[6], + 'optimizer/weight_l1': opt_stats[7], + 'optimizer/variance_abs_max': opt_stats_2[0], + 'optimizer/variance_sqrt_abs_max': opt_stats_2[1], + 'optimizer/momentum_abs_max': opt_stats_2[2], + 'optimizer/weight_abs_max': opt_stats_2[3], + } # print('step {} rank {} after sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if writer and is_last_rank(): writer.add_scalar('optimizer/variance_l2 vs tokens', opt_stats[0]**0.5, args.consumed_train_tokens) @@ -1045,27 +1082,24 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size tokens_per_gpu_per_second = tokens_per_sec / args.world_size tokens_per_gpu_per_second_per_replica = tokens_per_gpu_per_second / args.data_parallel_size - wandb_metrics = {} - if wandb is not None and getattr(wandb, 'run', None) is not None: - assert wandb.run is not None - wandb_metrics = { - 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s - 'throughput/samples_per_sec': samples_per_sec, - 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, - 'throughput/tokens_per_sec': tokens_per_sec, - 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, - 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, - 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, - 'throughput/tflops': tflops, - 'throughput/approx_params_in_billions': approx_parameters_in_billions, - 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, - 'throughput/iteration': iteration, + wandb_metrics |= { + 'throughput/iteration-time': elapsed_time_per_iteration, # 1000 ms / s + 'throughput/samples_per_sec': samples_per_sec, + 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica, + 
'throughput/tokens_per_sec': tokens_per_sec, + 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica, + 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, + 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, + 'throughput/tflops': tflops, + 'throughput/approx_params_in_billions': approx_parameters_in_billions, + 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, + 'throughput/iteration': iteration, + } + if loss_dict is not None: + wandb_metrics |= { + 'loss/iteration': iteration, + **{f'loss/{k}': v for k, v in loss_dict.items()} } - if loss_dict is not None: - wandb_metrics |= { - 'loss/iteration': iteration, - **{f'loss/{k}': v for k, v in loss_dict.items()} - } if writer and args.log_timers_to_tensorboard: writer.add_scalar('iteration-time/iteration-time', elapsed_time_per_iteration, iteration) @@ -1073,31 +1107,36 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, elapsed_time_per_iteration, args.consumed_train_samples) writer.add_scalar('iteration-time/iteration-time vs tokens', elapsed_time_per_iteration, args.consumed_train_tokens) - log_string = ' iteration {:8d}/{:8d} |'.format( - iteration, args.train_iters) - log_string += ' consumed samples: {:12d} |'.format( - args.consumed_train_samples) - log_string += ' consumed tokens: {:12d} |'.format( - args.consumed_train_tokens) - log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( - elapsed_time_per_iteration * 1000.0) - log_string += ' learning rate: {:.3E} |'.format(learning_rate) - log_string += ' global batch size: {:5d} |'.format(batch_size) - if wandb is not None and getattr(wandb, 'run', None) is not None: - wandb_metrics |= { - 'training/iteration': iteration, - 'training/iteration_time': elapsed_time_per_iteration, - 'training/iteration_time_vs_tokens': ( - (elapsed_time_per_iteration - / args.consumed_train_tokens) - ), - 'training/iteration_time_vs_samples': ( - 
(elapsed_time_per_iteration - / args.consumed_train_samples), - ), - 'training/consumed_samples': args.consumed_train_samples, - 'training/consumed_tokens': args.consumed_train_tokens, - } + log_string = f' iteration {iteration:8d}/{args.train_iters:8d} |' + # .format( iteration, args.train_iters) + log_string += ( + f' consumed samples: {args.consumed_train_samples:12d} |' + # .format(args.consumed_train_samples) + ) + log_string += f' consumed tokens: {args.consumed_train_tokens:12d} |' + # .format( args.consumed_train_tokens) + log_string += ( + ' elapsed time per iteration (ms): ' + f'{elapsed_time_per_iteration * 1000.0:.1f} |' + # .format( elapsed_time_per_iteration * 1000.0) + ) + log_string += f' learning rate: {learning_rate:.3E} |' + log_string += f' global batch size: {batch_size:5d} |' + # if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb_metrics |= { + 'training/iteration': iteration, + 'training/iteration_time': elapsed_time_per_iteration, + 'training/iteration_time_vs_tokens': ( + (elapsed_time_per_iteration + / args.consumed_train_tokens) + ), + 'training/iteration_time_vs_samples': ( + (elapsed_time_per_iteration + / args.consumed_train_samples) # no trailing comma: must stay a scalar, not a 1-tuple + ), + 'training/consumed_samples': args.consumed_train_samples, + 'training/consumed_tokens': args.consumed_train_tokens, + } for key in total_loss_dict: if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]: From 5273630333e0b4024f50b01f5741dae869182773 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 13:18:34 -0500 Subject: [PATCH 147/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index e296a637be..af88c83a0f 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -25,6 +25,7 @@ checkpoint_throughput_calculator ) from pathlib import Path +from enrich import get_logger import
deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -43,6 +44,7 @@ # backend='deepspeed', # port='5432', # ) +log = get_logger(__name__) RANK = get_rank() WORLD_SIZE = get_world_size() LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" @@ -51,6 +53,10 @@ DISABLE_WANDB = ( WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' ) +if RANK == 0: + log.setLevel("INFO") +else: + log.setLevel("CRITICAL") if RANK == 0 and not DISABLE_WANDB: project_name = ( @@ -74,9 +80,18 @@ def model_provider(pre_process=True, post_process=True): see_memory_usage("Before Building Model", force=True) args = get_args() config = core_transformer_config_from_args(args) - if wandb.run is not None: - print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") - wandb.run.config.update({"args": vars(args)}) + # if wandb.run is not None and RANK == 0: + # print(f"Updating WandB run: [{wandb.run.name}]({wandb.run.url})") + # try: + # wandb.run.config.update({"args": vars(args)}) + # except Exception: + # log.error( + # 'Unable to `wandb.run.config.update({"args": vars(args)})`' + # ) + # if wandb is not None and wandb.run is not None: + # assert wandb is not None and wandb.run is not None + # print(f'Updating {wandb.run.name=} at {wandb.run.url=}') + # wandb.run.config.update({'args': vars(args)}) if RANK == 0: git_ds_info() if hasattr(mpu, 'get_sequence_parallel_group'): @@ -85,10 +100,6 @@ def model_provider(pre_process=True, post_process=True): dpg = mpu.get_data_parallel_group() else: dpg = None - if wandb is not None and wandb.run is not None: - assert wandb is not None and wandb.run is not None - print(f'Updating {wandb.run.name=} at {wandb.run.url=}') - wandb.run.config.update({'args': vars(args)}) with deepspeed.zero.Init( data_parallel_group=dpg, remote_device=( @@ -153,6 +164,18 @@ def model_provider(pre_process=True, post_process=True): see_memory_usage("After Building Model", force=True) if wandb.run is not None: wandb.run.config.update({'num_params': 
num_params}) + if "args" not in wandb.run.config: + log.info( + f"Updating WandB run.config: [{wandb.run.name}]({wandb.run.get_url()})" + ) + try: + wandb.run.config.update( + {"args": dict(sorted(vars(args).items()))} + ) + except Exception: + log.error( + 'Unable to `wandb.run.config.update({"args": vars(args)})`' + ) try: wandb.run.watch( model, From b4a310ad19aeba318bd2238fc09796a93549ac57 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 13:18:51 -0500 Subject: [PATCH 148/268] Update `train_llama_alcf_sunspot.sh` --- train_llama_alcf_sunspot.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh index f000506bb9..ea4115aacd 100644 --- a/train_llama_alcf_sunspot.sh +++ b/train_llama_alcf_sunspot.sh @@ -92,6 +92,8 @@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ --lr ${LR} \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ From 4f7ee536f14601f3febbd18f6f6da9f3c1945ed7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 13:19:53 -0500 Subject: [PATCH 149/268] Update `megatron/training.py` --- megatron/training.py | 46 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index bf87e7b52c..a0dd659b7c 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -63,26 +63,55 @@ def print_datetime(string): time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + +def num_floating_point_operations(args, batch_size): + # Group Query Attention. + # if not args.group_query_attention: + if not args.num_key_value_heads: + args.num_key_value_heads = args.num_attention_heads + # args.num_query_groups = args.num_attention_heads + # MoE. 
+ # num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + num_experts_routed_to = 1 if args.num_experts is None else args.topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + return ( + 12 + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + 1 + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + + (args.num_key_value_heads / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) + ''' Since v0.9.0, deepspeed.initialize() has forbidden simultaneous setting of args.deepspeed_config (Path) and ds_config dict. So, we use ds_config dict which is the more flexible option. ''' def _create_ds_config_dict(): args = get_args() + assert args is not None if isinstance(args.deepspeed_config, dict) : ds_config_dict = args.deepspeed_config else: with open(args.deepspeed_config, 'r', encoding='utf-8') as config_file: ds_config_dict = json.load(config_file) - if args.universal_checkpoint: ds_config_dict["checkpoint"] = {"load_universal": True} - # Clear config path - args.deepspeed_config = None - + args.deepspeed_config = None return ds_config_dict - + def pretrain(train_valid_test_dataset_provider, model_provider, @@ -1021,6 +1050,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_pipeline_model_parallel_group()) wandb_metrics |= { + 'optimizer/learning_rate': learning_rate, 'optimizer/iteration': args.iteration, 'optimizer/consumed_train_tokens': args.consumed_train_tokens, 'optimizer/variance_l2': opt_stats[0]**0.5, @@ -1077,6 +1107,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, elapsed_time, total_iterations ) + num_flops = num_floating_point_operations(args, batch_size) + # throughput = ( + # 
num_floating_point_operations_so_far - arg + # ) samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size tokens_per_sec = samples_per_sec * seq_len tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size @@ -1091,6 +1125,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, 'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second, 'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica, 'throughput/tflops': tflops, + 'throughput/flops': num_flops, + 'throughput/tflops-new': num_flops / (elapsed_time_per_iteration * 10**12 * args.world_size), # TFLOP/s per GPU (elapsed time is in seconds) 'throughput/approx_params_in_billions': approx_parameters_in_billions, 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration, 'throughput/iteration': iteration, From 2f9cf05ce56149d3d492914e1c3fe1fd11a5a209 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 27 Mar 2024 13:33:11 -0500 Subject: [PATCH 150/268] Update `train_llama_alcf_{polaris,sunspot}.sh` --- train_llama_alcf_polaris.sh | 13 +++++++++---- train_llama_alcf_sunspot.sh | 5 +++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index 7cc1454f60..aae931fdba 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -45,6 +45,8 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" # source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit # echo "Using $(which python3)" @@ -52,10 +54,11 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ # ${launch_cmd} \ # --optimizer adam \ + # --use-flash-attn-v2 \ run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --use-flash-attn-v2 \ --$DTYPE \ + --optimizer ${OPT} \ --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ @@ -67,11 +70,13
@@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ --log-optimizer-states-to-tensorboard \ --lr ${LR} \ - --seq-length $SEQ \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ --num-layers ${NLAYERS} \ --hidden-size ${HIDDEN} \ --train-iters ${TRAIN_ITER} \ @@ -98,8 +103,8 @@ run_cmd=" " -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" +# echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "! Using $(which deepspeed)" ds_report echo "${run_cmd}" diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh index ea4115aacd..700f5dbfdb 100644 --- a/train_llama_alcf_sunspot.sh +++ b/train_llama_alcf_sunspot.sh @@ -72,6 +72,9 @@ custom_args=" $@" # Assert `./hostfile_deepspeed` exists export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + # --use-flash-attn-v2 \ # --use-flash-attn \ # --$DTYPE \ @@ -80,6 +83,7 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit # --adam-beta2 0.95 \ run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + --${DTYPE} \ --optimizer ${OPT} \ --num-workers 0 \ --split 100,0,0 \ @@ -92,6 +96,7 @@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ --log-timers-to-tensorboard \ --log-optimizer-states-to-tensorboard \ --lr ${LR} \ From 5b9ad9a91d71c9b0573e6ba678ff1d2355a587f1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 1 Apr 2024 23:16:47 -0500 Subject: [PATCH 151/268] Add support for `--optimizer={apex.adam,apex.sgd,adamw,adam,sgd}` --- megatron/optimizer/__init__.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 
e7400c39b5..cdcc344541 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,12 +1,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. from deepspeed.accelerator import get_accelerator -# if get_accelerator().device_name() == 'cuda': -# from apex.optimizers import FusedAdam as Adam -# from apex.optimizers import FusedSGD as SGD -# else: -# from torch.optim import Adam -# from torch.optim import SGD import torch from megatron import get_args @@ -94,7 +88,25 @@ def get_megatron_optimizer(model, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) else: - if str(args.optimizer).lower() == 'adamw': + if str(args.optimizer).lower() == 'apex.adam': + assert get_accelerator().device_name() == 'cuda' + from apex.optimizers import FusedAdam as Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif str(args.optimizer).lower() == 'apex.sgd': + from apex.optimizers import FusedSGD as SGD + optimizer = SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'adamw': optimizer = torch.optim.AdamW( param_groups, lr=args.lr, @@ -104,7 +116,7 @@ def get_megatron_optimizer(model, ) elif args.optimizer == 'adam': if args.ds_fused_adam: - global Adam + # global Adam from deepspeed.ops.adam import FusedAdam Adam = FusedAdam else: @@ -124,8 +136,7 @@ def get_megatron_optimizer(model, momentum=args.sgd_momentum ) else: - raise Exception(f'{args.optimizer} optimizer is not supported.') - + raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer From 294d81f2678f99a747f2bb30238a574e9e6fd5e1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 1 Apr 2024 23:17:17 -0500 Subject: [PATCH 152/268] Add support for \`--optimizer={apex.adam,apex.sgd,adamw,adam,sgd}\` - Updates: - `megatron/arguments.py` --- 
megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index cd618b285e..c26bac6fe9 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -914,7 +914,7 @@ def _add_training_args(parser): help='Disable bias in the linear layers', dest='add_bias_linear') group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'adamw', 'sgd'], + choices=['adam', 'adamw', 'sgd', 'apex.adam', 'apex.sgd'], help='Optimizer function') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], From bed55a033e79bd98e9ffe917dda78926a43c33d1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:13:21 -0500 Subject: [PATCH 153/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 30d9add286..27e2a65d45 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -80,26 +80,21 @@ setParams() { export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" export SAVE_INTERVAL=${SAVE_INTERVAL:-200} export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} - # export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} - # export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" tm="${PBS_O_WORKDIR}/ALCF/tokenizer.model" - # tm_a=/home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/tokenizer.model - # tm_p="/eagle/datasets/dolma/utils/tokenizer.model" - # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm_p:-${tm_a}}}" export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling 
--use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - # if [[ "${CPU_OPTIMIZER:-0}" ]]; then - # if [[ -n "${CPU_OPTIMIZER}" ]]; then - if [[ "${CPU_OPTIMIZER}" == 1 ]]; then - export OFFLOAD_DEVICE="cpu" - echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." - export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" - else - export OFFLOAD_DEVICE="none" - fi + # # if [[ "${CPU_OPTIMIZER:-0}" ]]; then + # # if [[ -n "${CPU_OPTIMIZER}" ]]; then + # if [[ "${CPU_OPTIMIZER}" == 1 ]]; then + # export OFFLOAD_DEVICE="cpu" + # echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." + # export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" + # else + # export OFFLOAD_DEVICE="none" + # fi # ---------------------------------------------------- } From fceb3738351b08671053d2d87cba02a256e7c748 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:37:21 -0500 Subject: [PATCH 154/268] Update `megatron/data/data_samplers.py` --- megatron/data/data_samplers.py | 89 +++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 0aae13abce..b242101b3a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,7 +2,6 @@ """Dataloaders.""" - import random import torch import numpy as np @@ -46,7 +45,10 @@ def build_pretraining_data_loader(dataset, consumed_samples): batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True, - multiprocessing_context=args.multiprocessing_context + multiprocessing_context=( + args.multiprocessing_context if args.num_workers > 0 + else None + ) ) if args.repeated_dataloader: loader=RepeatingLoader(loader) @@ -54,28 +56,39 @@ def build_pretraining_data_loader(dataset, consumed_samples): class MegatronPretrainingSampler: - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, 
data_parallel_size, drop_last=True): + def __init__( + self, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + drop_last=True + ): # Keep a copy of input params for later use. self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size + ) self.drop_last = drop_last # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) + assert self.consumed_samples < self.total_samples, ( + 'no samples left to consume: ' + f'{self.consumed_samples}, {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -125,8 +138,16 @@ def __getitem__(self, idx): class MegatronPretrainingRandomSampler: - def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, data_sharding): + def __init__( + self, + dataset, + total_samples, + consumed_samples, + micro_batch_size, + data_parallel_rank, + data_parallel_size, + data_sharding + ): # Keep a copy of input params for later use. 
self.dataset = dataset self.total_samples = total_samples @@ -135,19 +156,23 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, self.data_parallel_rank = data_parallel_rank self.data_parallel_size = data_parallel_size self.data_sharding = data_sharding - self.micro_batch_times_data_parallel_size = \ + self.micro_batch_times_data_parallel_size = ( self.micro_batch_size * data_parallel_size - self.last_batch_size = \ + ) + self.last_batch_size = ( self.total_samples % self.micro_batch_times_data_parallel_size + ) # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) + assert self.total_samples > 0, ( + f'no sample to consume: {self.total_samples}' + ) assert self.micro_batch_size > 0 assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) + assert self.data_parallel_rank < data_parallel_size, ( + f'data_parallel_rank should be smaller than data size: ' + f'{self.data_parallel_rank}, {data_parallel_size}' + ) def __len__(self): return self.total_samples @@ -163,23 +188,31 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + bucket_size = ( + self.micro_batch_size * ( + self.total_samples + // self.micro_batch_times_data_parallel_size + ) + ) bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + full_bucket_size = ( + self.micro_batch_size * ( + 
self.total_samples + // self.micro_batch_size + ) + ) full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) - idx_range_total = \ + idx_range_total = ( torch.randperm(full_bucket_size, generator=g).tolist() + ) idx_range_active = idx_range_total[full_bucket_offset:] idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size] From 07bb7bf87a7710a114645840de8ffe60dd854467 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:40:39 -0500 Subject: [PATCH 155/268] Add `train_llama_alcf.sh` --- train_llama_alcf.sh | 112 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 train_llama_alcf.sh diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh new file mode 100644 index 0000000000..0a2e7e5643 --- /dev/null +++ b/train_llama_alcf.sh @@ -0,0 +1,112 @@ +#!/bin/bash --login +#PBS -l walltime=06:00:00 +#PBS -A argonne_tpc +#PBS -q prod +#PBS -l select=48 +#PBS -l filesystems=eagle:home + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}"; return 1 # non-zero status so the caller's `|| exit` guard actually fires + fi +} + +# ----[0. Navigate into `$PBS_O_WORKDIR`]------------------------------------- +cd "${PBS_O_WORKDIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE +# ----[1. Assert `./pretrain_gpt_alcf.py` exists:]----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ----[2. `source ./ALCF/helpers_alcf.sh`:]------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4.
create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. print job info +# ----------------------------------------------------------------------------- + +# Take custom args +custom_args=" $@" + +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + +# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit +# echo "Using $(which python3)" +# --launcher_args='--pmi=pmix' + # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + # ${launch_cmd} \ + # --optimizer adam \ + # --use-flash-attn-v2 \ + # --num-workers 0 \ +run_cmd=" + deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + --$DTYPE \ + --optimizer ${OPT} \ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + --distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + 
--eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + +echo "! Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +eval "${run_cmd}" +set +x From 3c1cdb4c31bdde9503f0fe9f0a7869704b52bc2b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:41:18 -0500 Subject: [PATCH 156/268] Update `megatron/global_vars.py` --- megatron/global_vars.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index cb284b3c34..9f833fbd19 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -96,7 +96,7 @@ def set_global_variables(args): if args.exit_signal_handler: _set_signal_handler() - + def set_args(args): global _GLOBAL_ARGS @@ -137,11 +137,10 @@ def _set_tensorboard_writer(args): global _GLOBAL_TENSORBOARD_WRITER _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 'tensorboard writer') - if hasattr(args, 'tensorboard_dir') and \ args.tensorboard_dir and args.rank == (args.world_size - 1): try: - from torch.utils.tensorboard import SummaryWriter + from torch.utils.tensorboard.writer import SummaryWriter print('> setting tensorboard ...') _GLOBAL_TENSORBOARD_WRITER = SummaryWriter( log_dir=args.tensorboard_dir, @@ -179,9 +178,9 @@ def _set_timers(args): def _ensure_var_is_initialized(var, name): """Make sure the input variable is not None.""" - assert var is not None, '{} is not 
initialized.'.format(name) + assert var is not None, f'{name} is not initialized.' def _ensure_var_is_not_initialized(var, name): """Make sure the input variable is not None.""" - assert var is None, '{} is already initialized.'.format(name) + assert var is None, f'{name} is already initialized.' From 5fe64ac06fa31a735d146b1154e82665850f92f5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:45:55 -0500 Subject: [PATCH 157/268] Turn on flops profiler in `generate_config.sh` --- generate_config.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/generate_config.sh b/generate_config.sh index e140b7a274..d5b1264219 100644 --- a/generate_config.sh +++ b/generate_config.sh @@ -49,7 +49,7 @@ common="\ flops_profiler="\ \"flops_profiler\": { - \"enabled\": false, + \"enabled\": true, \"profile_step\": 45, \"module_depth\": -1, \"top_modules\": 1, @@ -94,6 +94,8 @@ zero="\ \"zero_optimization\": { \"stage\": 3, \"reduce_scatter\": false, + \"mics_shard_size\": 4, + \"mics_hierarchical_params_gather\": true, \"stage3_max_live_parameters\": 3e9, \"stage3_max_reuse_distance\": 3e9, \"stage3_param_persistence_threshold\": 1e5, @@ -145,7 +147,14 @@ if [[ $PP > 1 ]]; then \"debug\": false }," else - echo 'please add the config for zero_stage 1 without pipeline-parallelism' + # echo 'please add the config for zero_stage 1 without pipeline-parallelism' + extra="\ + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," fi else echo 'Please add the correct config set!!!' 
From 316fd938f076ef7e75ba692c2ac4ca0fa99bc857 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:46:31 -0500 Subject: [PATCH 158/268] Update `megatron/model/language_model.py` --- megatron/model/language_model.py | 132 +++++++++++++++++++++---------- 1 file changed, 89 insertions(+), 43 deletions(-) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index ec2ae1877a..ceef5be725 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -389,10 +389,16 @@ def __init__(self, post_process=True, num_experts=[1]): args = get_args() - # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. - if args.untie_embeddings_and_output_weights: assert not add_decoder - super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) - + # TODO: passing `share_embeddings_and_output_weights=False` + # will not work correctly for T5 and embeddings will not be synced. + # Fix later for T5. + if args.untie_embeddings_and_output_weights: + assert not add_decoder + super(TransformerLanguageModel, self).__init__( + share_embeddings_and_output_weights=( + not args.untie_embeddings_and_output_weights + ) + ) self.pre_process = pre_process self.post_process = post_process self.hidden_size = config.hidden_size @@ -405,27 +411,35 @@ def __init__(self, self.add_pooler = add_pooler self.encoder_hidden_state = None self.add_retriever = args.retro_add_retriever - self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + self.untie_embeddings_and_output_weights = ( + args.untie_embeddings_and_output_weights + ) self.num_experts = num_experts # Embeddings. 
if self.pre_process: - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - config, - self.num_tokentypes, - args.embedding_weights_in_fp32) + self.embedding = Embedding( + self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + args.embedding_weights_in_fp32 + ) self._embedding_key = 'embedding' # Rotary positional embeddings - self.use_rotary_position_embeddings = \ - args.use_rotary_position_embeddings + self.use_rotary_position_embeddings = ( + args.use_rotary_position_embeddings + ) if args.use_rotary_position_embeddings: self.seq_length = args.seq_length - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) if args.rotary_percent < 1.0: rotary_dim = int(rotary_dim * args.rotary_percent) @@ -433,15 +447,22 @@ def __init__(self, # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, theta=args.rope_theta) + self.rotary_pos_emb = RotaryEmbedding( + rotary_dim, + theta=args.rope_theta + ) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). 
if self.add_encoder: self.encoder = ParallelTransformer( config, - model_type=args.model_type if not args.retro_add_retriever \ - else ModelType.retro_decoder, + # args.model_type if not args.retro_add_retriever + # else ModelType.retro_decoder + model_type=( + ModelType.retro_decoder if args.retro_add_retriever + else args.model_type + ), self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, @@ -461,7 +482,8 @@ def __init__(self, self_attn_mask_type=self.decoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, - num_experts=self.num_experts) + num_experts=self.num_experts + ) self._decoder_key = 'decoder' else: self.decoder = None @@ -478,24 +500,30 @@ def __init__(self, args.padded_vocab_size, config=config, init_method=self.init_method, - bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + # Setting bias to False always to keep it consistent with + # embedding tying that also does not have a bias. 
+ bias=False + ) self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): """ See megatron.model.transformer.set_input_tensor()""" - # This is usually handled in schedules.py but some inference code still # gives us non-lists or None if not isinstance(input_tensor, list): input_tensor = [input_tensor] if self.add_encoder and self.add_decoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with both encoder and decoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with both encoder and decoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_encoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with only encoder' + assert len(input_tensor) == 1, ( + 'input_tensor should only be length 1 ' + 'for stage with only encoder' + ) self.encoder.set_input_tensor(input_tensor[0]) elif self.add_decoder: if len(input_tensor) == 2: @@ -505,32 +533,50 @@ def set_input_tensor(self, input_tensor): self.decoder.set_input_tensor(None) self.encoder_hidden_state = input_tensor[0] else: - raise Exception('input_tensor must have either length 1 or 2') + raise Exception( + 'input_tensor must have either length 1 or 2' + ) else: - raise Exception('Stage must have at least either encoder or decoder') - - def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, - dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - enc_dec_attn_mask=None, tokentype_ids=None, - inference_params=None, - pooling_sequence_index=0, - enc_hidden_states=None, output_enc_hidden=False): + raise Exception( + 'Stage must have at least either encoder or decoder' + ) + + def forward( + self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + dec_input_ids=None, + dec_position_ids=None, + dec_attn_mask=None, + retriever_input_ids=None, + 
retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, + tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, + output_enc_hidden=False + ): args = get_args() # Encoder embedding. if self.pre_process: - encoder_input = self.embedding(enc_input_ids, enc_position_ids, - tokentype_ids=tokentype_ids) + encoder_input = self.embedding( + enc_input_ids, + enc_position_ids, + tokentype_ids=tokentype_ids + ) else: encoder_input = None # Retriever embedding. if self.add_retriever and self.pre_process: - retriever_input = self.embedding(retriever_input_ids, - retriever_position_ids, - tokentype_ids=tokentype_ids) + retriever_input = self.embedding( + retriever_input_ids, + retriever_position_ids, + tokentype_ids=tokentype_ids + ) else: retriever_input = None From 69bb53e70f6ce27233b24e39633adde9f94c6e89 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:47:03 -0500 Subject: [PATCH 159/268] Remove `--num-workers 0` in `train_llama_alcf_polaris.sh` --- train_llama_alcf_polaris.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh index aae931fdba..06f268f10e 100644 --- a/train_llama_alcf_polaris.sh +++ b/train_llama_alcf_polaris.sh @@ -55,11 +55,11 @@ mkdir -p "${TBDIR}" # ${launch_cmd} \ # --optimizer adam \ # --use-flash-attn-v2 \ + # --num-workers 0 \ run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --$DTYPE \ --optimizer ${OPT} \ - --num-workers 0 \ --split 100,0,0 \ --log-interval 1 \ --no-bias-gelu-fusion \ From 2a36f142296bd4a3da3cccdb50cdc22dc6786663 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 08:54:18 -0500 Subject: [PATCH 160/268] Update `megatron/timers.py` --- megatron/timers.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/megatron/timers.py b/megatron/timers.py index 384c7c37a3..870ba8996f 100644 --- 
a/megatron/timers.py +++ b/megatron/timers.py @@ -8,8 +8,14 @@ import torch from deepspeed.accelerator import get_accelerator +from tensorboard.summary import Writer from packaging import version +try: + import wandb +except Exception: + wandb = None + class TimerBase(ABC): @@ -292,8 +298,15 @@ def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): print(output_string, flush=True) - def write(self, names, writer, iteration, normalizer=1.0, - reset=False, barrier=False): + def write( + self, + names: list[str], + writer: Writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = False, + barrier: bool = False + ): """Write timers to a tensorboard writer Note that we only report maximum time across ranks to tensorboard. """ @@ -303,7 +316,16 @@ def write(self, names, writer, iteration, normalizer=1.0, assert normalizer > 0.0 name_to_min_max_time = self._get_global_min_max_time( names, reset, barrier, normalizer) + timer_data = { + 'timers/iteration': iteration, + **{ + f'timers/{k}-time': name_to_min_max_time[k][1] + for k in name_to_min_max_time + } + } + if wandb is not None and getattr(wandb, 'run', None) is not None: + wandb.log(timer_data, commit=False) if writer is not None: for name in name_to_min_max_time: _, max_time = name_to_min_max_time[name] - writer.add_scalar(name + '-time', max_time, iteration) + writer.add_scalar(f'{name}-time', max_time, iteration) From 58b1696fe3e349b9c29a223886e103bf23f5c196 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 07:07:26 -0700 Subject: [PATCH 161/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 71 ++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 27e2a65d45..d5d7042089 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -12,6 +12,19 @@ printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" } + +function setupSrun() { + if [[ $(hostname) == login* 
|| $(hostname) == nid* ]]; then + export NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + + function setDSlauncher() { # launcher setting outdir=$1 @@ -35,11 +48,11 @@ setParams() { # -------- [Aurora] ---- || ----- [SunSpot] ------------ if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then TP=${TP:-1} # TP = 1 - PP=${PP:-1} # PP = 1 export CCL=${CCL:-ccl} # CCL export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 + export WORKING_DIR="${PBS_O_WORKDIR}" # if [[ -z "${CPU_OPTIMIZER}" ]]; then # CPU_OPTIMIZER=1 # fi @@ -48,17 +61,24 @@ setParams() { # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 - PP=${PP:-1} # PP = 1 export NCCL=${NCCL:-nccl} # NCCL export BE="${NCCL}" # BE = NCCL # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? 
export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 + export WORKING_DIR="${PBS_O_WORKDIR}" + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + TP="${TP:-2}" + export NCCL="${NCCL:-nccl}" + export BE="${CCL}" + export DTYPE="${DTYPE:-bf16}" + MICRO_BATCH="${MICRO_BATCH:-8}" + export WORKING_DIR="${SLURM_SUBMIT_DIR}" fi # ------------------------------------------------------------------------ - # export OFFLOAD_DEVICE="${OFFLOAD_DEVICE:-none}" - export PP="${PP}" export TP="${TP}" + export PP="${PP:-1}" + export DTYPE="${DTYPE:-bf16}" export OPT="${OPT:-adamw}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} @@ -82,19 +102,10 @@ setParams() { export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" - tm="${PBS_O_WORKDIR}/ALCF/tokenizer.model" + tm="${WORKING_DIR}/ALCF/tokenizer.model" export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - # # if [[ "${CPU_OPTIMIZER:-0}" ]]; then - # # if [[ -n "${CPU_OPTIMIZER}" ]]; then - # if [[ "${CPU_OPTIMIZER}" == 1 ]]; then - # export OFFLOAD_DEVICE="cpu" - # echo "\n!!! Appending \`--cpu-optimizer\` to LLAMA_ARGS..." 
- # export LLAMA_ARGS="${LLAMA_ARGS} --cpu-optimizer" - # else - # export OFFLOAD_DEVICE="none" - # fi # ---------------------------------------------------- } @@ -176,14 +187,8 @@ buildDSconfig() { export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" echo "DS_CONFIG: ${DS_CONFIG}" printf "ZS: %s, CPU_OPTIMIZER: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" - bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" - # if [[ -z "${CPU_OPTIMIZER}" ]]; then - # echo "!!! Using GPU Optimizer !!!" - # bash "${PBS_O_WORKDIR}/generate_config.sh" "${DS_CONFIG}" #|| exit 1 - # else - # echo "!!! Using CPU Optimizer !!!" - # bash "${PBS_O_WORKDIR}/generate_config_cpu_optimizer.sh" "${DS_CONFIG}" - # fi + working_dir="${PBS_O_WORKDIR:-${SLURM_SUBMIT_DIR:-$(pwd)}}" + bash "${working_dir}/generate_config.sh" "${DS_CONFIG}" # ------------------------------------------------------------- } @@ -224,29 +229,21 @@ setEnv() { echo "Running on Polaris !!" # ---- [load conda] --------------------- module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE - # module load conda/2023-10-04 ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/py311-cu118 - # ; conda activate /lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06 - # export PYTHONUSERBASE="${HOME}/.local/polaris/conda/py311-cu118" - # mkdir -p "${PYTHONUSERBASE}" - # if [[ "${VIRTUAL_ENV}" ]]; then - # echo "Caught VIRTUAL_ENV = ${VIRTUAL_ENV} from environment!!" - # else - # echo "Not using VIRTUAL_ENV" - # # sourceFile "${HERE}/venvs/polaris/2023-10-04/bin/activate" || exit - # fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" 
+ module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" else # ------------------------------------- [Unknown] ------------------- echo "Unknown hostname $(hostname)" exit 1 fi + echo "[python] Using: $(which python3)" } makeHostfiles() { - # GPUS_PER_NODE=$(python3 -Wignore -c 'import ezpz; print(ezpz.get_gpus_per_node())') - # source $(python3 -c 'import ezpz; print(ezpz.SAVEJOBENV.as_posix())') || exit - # source $(python3 -c 'import ezpz; print(ezpz.GETJOBENV.as_posix())') || exit source ezpz/src/ezpz/bin/savejobenv || exit #> /tmp/savejobenv.log 2>&1 & source ezpz/src/ezpz/bin/getjobenv || exit - export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST}}" + export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" # ---- Make MPICH hostfile ---------------- hf="${HOSTFILE:-${PBS_NODEFILE}}" export hostfile_mpich=hostfile_mpich @@ -264,6 +261,8 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" elif [[ $(hostname) == x3* ]]; then dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" else echo "Unknown hostname. Must manually specify DATA_FILE_LIST." 
fi From 9ac01590afc89016a3cfb699fc5c5d15e42f15a0 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 07:31:56 -0700 Subject: [PATCH 162/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index d5d7042089..c0976bd6de 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -53,11 +53,6 @@ setParams() { export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 export WORKING_DIR="${PBS_O_WORKDIR}" - # if [[ -z "${CPU_OPTIMIZER}" ]]; then - # CPU_OPTIMIZER=1 - # fi - # echo "!!!! Using CPU_OPTIMIZER on Intel XPU by Default !!!!" - # export CPU_OPTIMIZER=${CPU_OPTIMIZER:-1} # CPU OPTIMIZER ON INTEL XPU # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 @@ -67,10 +62,11 @@ setParams() { export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 export WORKING_DIR="${PBS_O_WORKDIR}" + # -------- [Perlmutter] --------------------------------- elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then TP="${TP:-2}" export NCCL="${NCCL:-nccl}" - export BE="${CCL}" + export BE="${NCCL}" export DTYPE="${DTYPE:-bf16}" MICRO_BATCH="${MICRO_BATCH:-8}" export WORKING_DIR="${SLURM_SUBMIT_DIR}" From e54063b8267a00c5328dd4a4ef966b13537e6227 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 07:33:04 -0700 Subject: [PATCH 163/268] Add `train_llama_nersc_perlmutter.sh` --- train_llama_nersc_perlmutter.sh | 141 ++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 train_llama_nersc_perlmutter.sh diff --git a/train_llama_nersc_perlmutter.sh b/train_llama_nersc_perlmutter.sh new file mode 100644 index 0000000000..8131579809 --- /dev/null +++ b/train_llama_nersc_perlmutter.sh @@ -0,0 +1,141 @@ +#!/bin/bash --login +#SBATCH -A m4388_g +#SBATCH -C 'gpu&hbm80g' +#SBATCH -q regular +#SBATCH -t 00:30:00 
+#SBATCH --nodes 128 +#SBATCH --gpus 512 +# + +function sourceFile() { + fp="$1" + echo "source-ing ${fp}" + if [[ -f "${fp}" ]]; then + # shellcheck source="${fp}" + source "${fp}" + else + echo "ERROR: UNABLE TO SOURCE ${fp}" + fi +} + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- +# cd "${PBS_O_WORKDIR}" || exit +cd "${SLURM_SUBMIT_DIR}" || exit +HERE=$(python3 -c 'import os; print(os.getcwd())') +export HERE +# dflfb="${HERE}/genslm-subsample.txt" +# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- +export EXEC="${HERE}/pretrain_gpt_alcf.py" +[ -f "${EXEC}" ] || exit +# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ +sourceFile "${HERE}/ALCF/helpers.sh" || exit +# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ +setEnv || exit # 1. load `conda` environment +saveDSenv || exit # 2. save env vars to `.deepspeed_env` +ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars +makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 8. specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST:-${dflfb}}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +printJobInfo || exit # 11. 
print job info +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# Take custom args +custom_args=" $@" + +# Assert `./hostfile_deepspeed` exists +export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + +# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit +# echo "Using $(which python3)" +# --launcher_args='--pmi=pmix' + # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ + # ${launch_cmd} \ + # --optimizer adam \ + # --use-flash-attn-v2 \ + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ +# source ezpz/src/ezpz/bin/getjobenv || exit +# if [[ -z "${DIST_LAUNCH}" ]]; then +# setupSrun || exit +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# else +# SRUN_EXEC="${DIST_LAUNCH}" +# fi +# echo "Using SRUN_EXEC: ${SRUN_EXEC}" +# +export NHOSTS="${SLURM_NNODES:-1}" +export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" +export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" +export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + + # srun --gpus ${NGPUS} \ + # --gpus-per-node ${NGPU_PER_HOST} \ + # -N ${NHOSTS} \ + # -n ${NGPUS} \ + # -l -u --verbose python3 ${EXEC} \ +run_cmd=" + ${SRUN_EXEC} python3 ${EXEC} \ + --$DTYPE \ + --optimizer ${OPT} \ + --num-workers 0 \ + --split 100,0,0 \ + --log-interval 1 \ + --no-bias-gelu-fusion \ + --lr-decay-style cosine \ + --no-bias-dropout-fusion \ + --no-masked-softmax-fusion \ + --tokenizer-type Llama2Tokenizer \ + --no-gradient-accumulation-fusion \ + --accumulate-allreduce-grads-in-fp32 \ + --use-checkpoint-opt_param-scheduler \ + --tensorboard-dir ${TBDIR} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + --lr ${LR} \ + --save ${CKPT_DIR} \ + --load ${CKPT_DIR} \ + --seq-length ${SEQ} \ + --num-layers ${NLAYERS} \ + --hidden-size ${HIDDEN} \ + --train-iters ${TRAIN_ITER} \ + --eval-iters ${EVAL_ITERS} \ + 
--distributed-backend ${BE} \ + --num-attention-heads ${HEADS} \ + --save-interval ${SAVE_INTERVAL} \ + --eval-interval ${EVAL_INTERVAL} \ + --max-position-embeddings ${SEQ} \ + --micro-batch-size ${MICRO_BATCH} \ + --data-file-list ${DATA_FILE_LIST} \ + --tensor-model-parallel-size ${TP} \ + --global-batch-size ${GLOBAL_BATCH} \ + --pipeline-model-parallel-size ${PP} \ + --num-key-value-heads ${NUM_KV_HEAD} \ + --data-cache-path ${DATA_CACHE_PATH} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --tokenizer-model ${TOKENIZER_MODEL} \ + ${LLAMA_ARGS} \ + $ds_args \ + ${gpt_args[*]} \ + $custom_args \ + |& tee ${OUTPUT_LOG} + " + +run_cmd=$(echo "${run_cmd}" | sed -e 's/ */ /g') + +# echo "All DeepSpeed(s): $(which -a deepspeed)" +echo "! Using $(which deepspeed)" +ds_report + +echo "${run_cmd}" + +printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" +printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +# echo "${OUTPUT_LOG}" +eval "${run_cmd}" +set +x From 8ac8bdc651f0fe31c5960294b9f1a21d57b006e8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 10:01:38 -0500 Subject: [PATCH 164/268] Update `{train_llama_alcf.sh,ALCF/helpers.sh}` --- ALCF/helpers.sh | 3 +++ train_llama_alcf.sh | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index c0976bd6de..ee97709ef4 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -102,6 +102,9 @@ setParams() { export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" + if [[ -z "${NO_FLASH_ATTN}" ]]; then + export LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" + fi # ---------------------------------------------------- } diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 0a2e7e5643..f0c939b9f7 100644 
--- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -50,11 +50,11 @@ mkdir -p "${TBDIR}" # source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit # echo "Using $(which python3)" # --launcher_args='--pmi=pmix' - # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ - # ${launch_cmd} \ - # --optimizer adam \ - # --use-flash-attn-v2 \ - # --num-workers 0 \ +# deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ +# ${launch_cmd} \ +# --use-flash-attn-v2 \ +# --num-workers 0 \ + run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --$DTYPE \ From 8c6c91f5ecc2eccf87a192cdfe4a86b620165013 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 10:02:01 -0500 Subject: [PATCH 165/268] Update `megatron/training.py` --- megatron/training.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index a0dd659b7c..9b701ea687 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -113,15 +113,17 @@ def _create_ds_config_dict(): return ds_config_dict -def pretrain(train_valid_test_dataset_provider, - model_provider, - model_type, - forward_step_func, - process_non_loss_data_func=None, - extra_args_provider=None, - args_defaults={}, - data_post_process=None, - external_args={}): +def pretrain( + train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + extra_args_provider=None, + args_defaults={}, + data_post_process=None, + external_args={}, +) -> torch.nn.Module: """Main training program. This function will run the followings in the order provided: @@ -149,6 +151,9 @@ def pretrain(train_valid_test_dataset_provider, to it. It is used for programs to add their own arguments. args_defaults: a dictionary from argument-name to argument-value. It to set already parse arguments. + + Returns: + model (torch.nn.Module) """ # Initalize and get arguments, timers, and Tensorboard writer. 
From 7794fc07fe74fcdecc53fa1061e69d2eb8084e6b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 10:02:20 -0500 Subject: [PATCH 166/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index a91217afa6..f5061bf5b2 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -571,18 +571,21 @@ def main(): model_provider, ModelType.encoder_or_decoder, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) - - prof.export_chrome_trace(f"{args.tensorboard_dir}/torch-trace-{RANK}-of-{WORLD_SIZE}.json") + args = get_args() + prof.export_chrome_trace( + f"{args.tensorboard_dir}" + "/torch-trace-{RANK}-of-{WORLD_SIZE}.json" + ) else: model = pretrain( train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) return model From 590630e79e2895452a908b73b3f37678484ef773 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 4 Apr 2024 11:13:02 -0500 Subject: [PATCH 167/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index ee97709ef4..bb612def4a 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -44,6 +44,7 @@ function setDSlauncher() { } setParams() { + LLAMA_ARGS="" # ---- [Parallelism Settings] -------------------------------------------- # -------- [Aurora] ---- || ----- [SunSpot] ------------ if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then @@ -53,6 +54,9 @@ setParams() { export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 export WORKING_DIR="${PBS_O_WORKDIR}" + if [[ -z 
"${NO_FLASH_ATTN}" ]]; then + LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn" + fi # -------- [Polaris] ----------------------------------- elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 @@ -62,6 +66,9 @@ setParams() { export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 export WORKING_DIR="${PBS_O_WORKDIR}" + if [[ -z "${NO_FLASH_ATTN}" ]]; then + LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" + fi # -------- [Perlmutter] --------------------------------- elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then TP="${TP:-2}" @@ -70,6 +77,9 @@ setParams() { export DTYPE="${DTYPE:-bf16}" MICRO_BATCH="${MICRO_BATCH:-8}" export WORKING_DIR="${SLURM_SUBMIT_DIR}" + if [[ -z "${NO_FLASH_ATTN}" ]]; then + LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" + fi fi # ------------------------------------------------------------------------ export TP="${TP}" @@ -101,10 +111,7 @@ setParams() { tm="${WORKING_DIR}/ALCF/tokenizer.model" export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" - export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - if [[ -z "${NO_FLASH_ATTN}" ]]; then - export LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" - fi + export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" # ---------------------------------------------------- } From d03aac040e93045fe9b68c516e9528676a2c39e3 Mon Sep 17 00:00:00 2001 From: Varuni Sastry <88804132+vksastry@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:19:47 -0500 Subject: [PATCH 168/268] Update README.md with data preprocessing info and links. 
--- ALCF/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ALCF/README.md b/ALCF/README.md index 1a8612ed8a..d95c70125d 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -196,3 +196,14 @@ modules and launch # [...] ```
+ +### Data Preprocessing + +AuroraGPT is trained on the Dolma dataset (initially v0; we are now in the process of moving to v6). For more details on the dataset, refer to https://huggingface.co/datasets/allenai/dolma. The downloaded Dolma dataset has already been preprocessed to remove duplicates (dedup) and to filter the data (mixing). For more details, refer to https://github.com/allenai/dolma/tree/main/docs and https://github.com/vksastry/dolma_alcf/blob/main/ALCF/Readme.md. + +The data preprocessing of the Dolma dataset before training consists of tokenizing the data with a specific tokenizer (we currently use LlamaTokenizer). Use the script below to tokenize the entire dataset. The example shown is for Polaris. + +``` bash +cd /eagle/datasets/dolma/utils +./tokenization.sh +``` From c72914f65ad8542eebc612da19a37dc2ad42651f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 11:59:36 -0500 Subject: [PATCH 169/268] Update `megatron/core/tensor_parallel/cross_entropy.py` --- megatron/core/tensor_parallel/cross_entropy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 9dcdc0459f..d0453d25ea 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -69,14 +69,14 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): vocab_size = exp_logits.size(-1) if label_smoothing > 0: - """ + r""" We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. 
= (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K - From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + From: """ assert 1.0 > label_smoothing > 0.0 smoothing = label_smoothing * vocab_size / (vocab_size - 1) From 7848cd49c70592432c46910a0efb5d01ace2b430 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 12:00:47 -0500 Subject: [PATCH 170/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index f5061bf5b2..46036efbad 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -94,8 +94,8 @@ def model_provider(pre_process=True, post_process=True): # wandb.run.config.update({'args': vars(args)}) if RANK == 0: git_ds_info() - if hasattr(mpu, 'get_sequence_parallel_group'): - dpg = mpu.get_sequence_parallel_group() + if hasattr(mpu, 'get_sequence_data_parallel_group'): + dpg = mpu.get_sequence_data_parallel_group() elif hasattr(mpu, 'get_data_parallel_group'): dpg = mpu.get_data_parallel_group() else: @@ -154,6 +154,7 @@ def model_provider(pre_process=True, post_process=True): pre_process=pre_process, post_process=post_process ) + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) # print_rank_0('\n ------------------------ ') # print_rank_0(f'num of parameters {num_params}') @@ -588,6 +589,18 @@ def main(): # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) + try: + from megatron.text_generation import generate_and_post_process + import ezpz as ez + with 
torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16): + response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32) + if RANK == 0: + log.info(f'generation completed..\n response:{response}') + except ValueError as ve: + log.critical(f'ValueError: {ve}') + pass + # dist.barrier() + model.train() return model From f2b82b964beba5eac36be836a2a185a3515baf0a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 12:01:16 -0500 Subject: [PATCH 171/268] Removes old `train_sbatch_pp64.sh` --- train_sbatch_pp64.sh | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100755 train_sbatch_pp64.sh diff --git a/train_sbatch_pp64.sh b/train_sbatch_pp64.sh deleted file mode 100755 index b7baf2539e..0000000000 --- a/train_sbatch_pp64.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash --login -#SBATCH -A m3957_g -#SBATCH -C 'gpu&hbm80g' -#SBATCH -q regular -#SBATCH -t 00:30:00 -#SBATCH --nodes 128 -#SBATCH --gpus 512 - - -# TODO:: -# - Add logic for catching / killing hung process at end of run to ensure -# second run starts up (otherwise, it will wait for the hung process, which -# will run until the job is killed) -# - This wll let us try running multiple experiments in a single slurm job -# allocation. 
-# - Existing (similar implementation) from my `~/bin/kill-match`: -# ```bash -# #!/bin/bash --login -# TO_KILL=$1 -# kill $(ps aux | grep -E "$USER.+($TO_KILL)" | grep -v grep | awk '{print $2}') - - -PPSIZE=64 \ - MODEL_SIZE_KEY="GPT1T_$(( 2 * PPSIZE ))L" \ - SEQ_LEN=2048 \ - MICRO_BATCH=2 \ - GAS=$(( 8 * PPSIZE )) \ - SP_TYPE=megatron \ - ZERO_STAGE=1 \ - USE_SEQUENCE_PARALLEL=0 \ - MPSIZE=8 \ - SPSIZE=1 \ - USE_ACTIVATION_CHECKPOINTING=1 \ - ./ALCF/train-gpt3.sh From 5c3b5b7f98a2e09d75160d03a61a5e67ecba9ed1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 15:36:24 -0500 Subject: [PATCH 172/268] Update `generate_config.sh` --- generate_config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_config.sh b/generate_config.sh index d5b1264219..b164b5e610 100644 --- a/generate_config.sh +++ b/generate_config.sh @@ -50,7 +50,7 @@ common="\ flops_profiler="\ \"flops_profiler\": { \"enabled\": true, - \"profile_step\": 45, + \"profile_step\": 4, \"module_depth\": -1, \"top_modules\": 1, \"detailed\": true, From 4e5c38321ca8d099b8bdad6fb0d47c54febd483e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 15:37:56 -0500 Subject: [PATCH 173/268] Add support for `schedulefree.{AdamWScheduleFree,SGDScheduleFree}` --- megatron/arguments.py | 10 +++- megatron/optimizer/__init__.py | 94 +++++++++++++++++++--------------- megatron/training.py | 43 +++++++++++----- 3 files changed, 94 insertions(+), 53 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 994a727ec3..5f9b6f4144 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -916,7 +916,15 @@ def _add_training_args(parser): help='Disable bias in the linear layers', dest='add_bias_linear') group.add_argument('--optimizer', type=str, default='adam', - choices=['adam', 'adamw', 'sgd', 'apex.adam', 'apex.sgd'], + choices=[ + 'adam', + 'adamw', + 'sgd', + 'apex.adam', + 'apex.sgd', + 'adamwschedulefree', + 'sgdschedulefree' + ], 
help='Optimizer function') group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index cdcc344541..fdaa1f7dc7 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -87,8 +87,22 @@ def get_megatron_optimizer(model, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) - else: - if str(args.optimizer).lower() == 'apex.adam': + elif str(args.optimizer).lower() == 'adamwschedulefree': + import schedulefree + optimizer = schedulefree.AdamWScheduleFree( + param_groups, + lr=args.lr, + warmup_steps=args.lr_warmup_iters, + ) + elif str(args.optimizer).lower() == 'sgdschedulefree': + import schedulefree + optimizer = schedulefree.SGDScheduleFree( + param_groups, + lr=args.lr, + warmup_steps=args.lr_warmup_iters, + ) + # else: + elif str(args.optimizer).lower() == 'apex.adam': assert get_accelerator().device_name() == 'cuda' from apex.optimizers import FusedAdam as Adam optimizer = Adam( @@ -98,45 +112,45 @@ def get_megatron_optimizer(model, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps ) - elif str(args.optimizer).lower() == 'apex.sgd': - from apex.optimizers import FusedSGD as SGD - optimizer = SGD( - param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum - ) - elif str(args.optimizer).lower() == 'adamw': - optimizer = torch.optim.AdamW( - param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps - ) - elif args.optimizer == 'adam': - if args.ds_fused_adam: - # global Adam - from deepspeed.ops.adam import FusedAdam - Adam = FusedAdam - else: - Adam = torch.optim.Adam - optimizer = Adam( - param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps - ) - elif args.optimizer == 'sgd': - optimizer = 
torch.optim.SGD( - param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum - ) + elif str(args.optimizer).lower() == 'apex.sgd': + from apex.optimizers import FusedSGD as SGD + optimizer = SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + elif str(args.optimizer).lower() == 'adamw': + optimizer = torch.optim.AdamW( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'adam': + if args.ds_fused_adam: + # global Adam + from deepspeed.ops.adam import FusedAdam + Adam = FusedAdam else: - raise TypeError(f'{args.optimizer} optimizer is not supported.') + Adam = torch.optim.Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) + elif args.optimizer == 'sgd': + optimizer = torch.optim.SGD( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum + ) + else: + raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer diff --git a/megatron/training.py b/megatron/training.py index e4b2511767..634713eafb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1018,18 +1018,37 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, opt_stats_2 = [0.0] * 4 for _, group in enumerate(optimizer.param_groups): for _, param in enumerate(group['params']): - opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2 - opt_stats[1] += (torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt()).item())**2 - opt_stats[2] += (torch.norm(optimizer.state[param]['exp_avg']).item())**2 - opt_stats[3] += (torch.norm(param).item())**2 - opt_stats[4] += torch.norm(optimizer.state[param]['exp_avg_sq'],p=1).item() - opt_stats[5] += 
torch.norm(optimizer.state[param]['exp_avg_sq'].sqrt(),p=1).item() - opt_stats[6] += torch.norm(optimizer.state[param]['exp_avg'],p=1).item() - opt_stats[7] += torch.norm(param,p=1).item() - opt_stats_2[0] = max(opt_stats_2[0], abs(optimizer.state[param]['exp_avg_sq'].max().item()), abs(optimizer.state[param]['exp_avg_sq'].min().item())) - opt_stats_2[1] = max(opt_stats_2[1], optimizer.state[param]['exp_avg_sq'].sqrt().abs_().max().item()) - opt_stats_2[2] = max(opt_stats_2[2], abs(optimizer.state[param]['exp_avg'].max().item()), abs(optimizer.state[param]['exp_avg'].min().item())) - opt_stats_2[3] = max(opt_stats_2[3], abs(param.max().item()), abs(param.min().item())) + state_param = getattr(optimizer, 'state', None) + if state_param is not None: + exp_avg_sq = state_param.get('exp_avg_sq', torch.tensor(0.)) + exp_avg = state_param.get('exp_avg', torch.tensor(0.)) + opt_stats[0] += (torch.norm(exp_avg_sq).item()) ** 2 + opt_stats[1] += (torch.norm(exp_avg_sq.sqrt()).item()) ** 2 + opt_stats[2] += (torch.norm(exp_avg).item()) ** 2 + opt_stats[3] += (torch.norm(param).item()) ** 2 + opt_stats[4] += torch.norm(exp_avg_sq, p=1).item() + opt_stats[5] += torch.norm(exp_avg_sq.sqrt(), p=1).item() + opt_stats[6] += torch.norm(exp_avg, p=1).item() + opt_stats[7] += torch.norm(param, p=1).item() + opt_stats_2[0] = max( + opt_stats_2[0], + abs(exp_avg_sq.max().item()), + abs(exp_avg_sq.min().item()) + ) + opt_stats_2[1] = max( + opt_stats_2[1], + exp_avg_sq.sqrt().abs_().max().item() + ) + opt_stats_2[2] = max( + opt_stats_2[2], + abs(exp_avg.max().item()), + abs(exp_avg.min().item()) + ) + opt_stats_2[3] = max( + opt_stats_2[3], + abs(param.max().item()), + abs(param.min().item()) + ) # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if args.zero_stage > 0: # ZeRO partiions optimizer states From a68ed8ebaeef43b2d5bcb63d8e8da905f40b7284 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 
2024 15:38:25 -0500 Subject: [PATCH 174/268] update `train_llama_alcf.sh` --- train_llama_alcf.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index f0c939b9f7..ce18842850 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -47,6 +47,15 @@ export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit TBDIR="${CKPT_DIR}/tensorboard" mkdir -p "${TBDIR}" +# TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())') +# printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}" +# +# if [[ "${TORCH_DEVICE}" == "cuda" ]]; then +# printf %s "Setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" +# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +# fi + + # source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit # echo "Using $(which python3)" # --launcher_args='--pmi=pmix' @@ -55,6 +64,9 @@ mkdir -p "${TBDIR}" # --use-flash-attn-v2 \ # --num-workers 0 \ + # aprun -n "${NGPUS}" -N "${NGPU_PER_HOST}" --pmi=pmix ${PBS_O_WORKDIR}/local_rank.sh + # ${DIST_LAUNCH} $(which python3) ${EXEC} \ +# yeet="${DIST_LAUNCH} ./local_rank.sh" run_cmd=" deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ --$DTYPE \ From a70aa6e95b8c9e24a86c59f6f9401828f7c49aaf Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 16 Apr 2024 15:38:49 -0500 Subject: [PATCH 175/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 46036efbad..0139330277 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -36,34 +36,33 @@ import time from torch import nn import torch.nn.functional as F +import ezpz as ez -# from ezpz import get_logger -from ezpz.dist import get_world_size, setup_wandb, get_rank -# RANK = setup_torch( +# ---- SETUP DISTRIBUTED COMMS ---- +# RANK = ez.setup_torch( # backend='deepspeed', # port='5432', # ) +RANK = ez.get_rank() 
+WORLD_SIZE = ez.get_world_size() +DEVICE = ez.get_torch_device() + +# --- TURN OFF LOGGER ON ALL RANK != 0 ---- log = get_logger(__name__) -RANK = get_rank() -WORLD_SIZE = get_world_size() -LEVEL = "DEBUG" if RANK == 0 else "CRITICAL" +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") +# ---- SETUP WANDB FROM RANK 0 ---------------- WANDB_MODE = os.environ.get('WANDB_MODE', None) DISABLE_WANDB = ( WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' ) -if RANK == 0: - log.setLevel("INFO") -else: - log.setLevel("CRITICAL") - if RANK == 0 and not DISABLE_WANDB: project_name = ( os.environ.get( - 'WB_PROJECT', + 'WB_PROJECT', # look for WB_PROJECT in env os.environ.get( - 'WANDB_PROJECT', + 'WANDB_PROJECT', # look for WANDB_PROJECT in env 'AuroraGPT' ), ) @@ -71,7 +70,7 @@ print('--------------------------------------------------') print(f"Setting up W&B from: {RANK} with {project_name}") print('--------------------------------------------------') - setup_wandb(project_name=project_name) + ez.setup_wandb(project_name=project_name) def model_provider(pre_process=True, post_process=True): @@ -591,7 +590,6 @@ def main(): ) try: from megatron.text_generation import generate_and_post_process - import ezpz as ez with torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16): response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32) if RANK == 0: From 1de3c662eb2302dc99290cfd80cf80055469c811 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 17 Apr 2024 08:05:53 -0500 Subject: [PATCH 176/268] Fix checkpointing with `schedulefree.*` optimizers --- ALCF/helpers.sh | 5 ++- megatron/checkpointing.py | 88 +++++++++++++++++++++++++++++++++++++-- megatron/training.py | 64 ++++++++++++++++------------ 3 files changed, 126 insertions(+), 31 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index bb612def4a..cd04f890b9 100644 
--- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -178,7 +178,7 @@ saveDSenv() { setOutput() { # ---- Specify output location -------------------------------- - export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" + export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" @@ -235,6 +235,9 @@ setEnv() { echo "Running on Polaris !!" # ---- [load conda] --------------------- module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE + if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + fi elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then echo "Running on Perlmutter !!" 
module load pytorch diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 3e4d20035e..04201fcc06 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -8,6 +8,12 @@ import numpy as np from deepspeed.accelerator import get_accelerator import torch +import ezpz as ez +import logging +import torch.distributed as tdist + +import yaml +from pathlib import Path from megatron import update_num_microbatches, get_tokenizer from megatron.core import mpu, tensor_parallel @@ -24,6 +30,12 @@ UNIVERSAL_CHECKPOINT_VERSION_VALUE, ) +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +DEVICE = ez.get_torch_device() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") + _CHECKPOINT_VERSION = None @@ -225,6 +237,10 @@ def get_rng_state(): def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): """Save a model checkpoint.""" args = get_args() + assert args is not None + iteration = args.iteration + + save_lr_state_dict() # Only rank zero of the data parallel writes to the disk. if not args.deepspeed: @@ -529,15 +545,81 @@ def _set_arg(arg_name, old_arg_name=None, force=False): return args, checkpoint_args -def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, load_only_weights=False): +def load_lr_state_dict(strict: bool = False) -> dict: + """Load {iteration, lr} from .yaml file when restoring from checkpoint.""" + args = get_args() + assert args is not None + lr_state_dict_fp = Path(args.load).joinpath( + f"lr_state_dict_{RANK}_of_{WORLD_SIZE}.yaml" + ) + lr_state_dict = {} + if lr_state_dict_fp.is_file(): + with lr_state_dict_fp.open('r') as f: + lr_state_dict = yaml.safe_load(f) + args.lr = lr_state_dict['lr'] + else: + if strict: + raise FileNotFoundError( + f"{lr_state_dict_fp=}.is_file() is False" + ) + log.info( + f"Unable to load lr_state_dict from {lr_state_dict_fp=}, " + f"but strict=False. 
Returning empty dictionary: {lr_state_dict=}" + ) + return lr_state_dict + + +def save_lr_state_dict() -> None: + """Save {iteration, lr} to .yaml file for safe-keeping. + + Make sure we're only saving from RANK == 0. + """ + if RANK != 0: + return None + args = get_args() + assert args is not None + outdir = getattr(args, 'save', None) + assert outdir is not None + lr_state_dict_fp = Path(args.save).joinpath( + "lr_state_dict.yaml" + ) + log.info(f"Saving lr_state_dict to {lr_state_dict_fp.as_posix()}") + with lr_state_dict_fp.open('w') as f: + yaml.dump( + {'iteration': args.iteration, 'lr': args.lr}, + f + ) + + +def load_checkpoint( + model, + optimizer, + opt_param_scheduler, + load_arg: str = 'load', + strict: bool = True, + load_only_weights: bool = False, + strict_lr_state_dict: bool = False +): """Load a model checkpoint and return the iteration. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint match the names of parameters and buffers in model. """ args = get_args() + assert args is not None load_dir = getattr(args, load_arg) - + if RANK == 0: + lr_state_dict = load_lr_state_dict(strict=strict_lr_state_dict) + lr_tensor = torch.tensor( + lr_state_dict['lr'], + requires_grad=False, + device=DEVICE + ) + else: + lr_state_dict = {} + lr_tensor = torch.tensor(0., requires_grad=False, device=DEVICE) + tdist.broadcast(lr_tensor, 0) + args.lr = lr_tensor.item() if args.deepspeed: if args.finetune: loaded_dir, state_dict = model[0].load_checkpoint(load_dir, @@ -553,7 +635,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri print_rank_0(' will not load any checkpoints and will start from ' 'random') return 0 - release = False + release = False else: model = unwrap_model(model) diff --git a/megatron/training.py b/megatron/training.py index 634713eafb..c02ca8d0ef 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -10,6 +10,7 @@ # The earliest we can measure the start time. 
_TRAIN_START_TIME = time.time() import torch +import torch.distributed as tdist from collections import OrderedDict from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP @@ -49,9 +50,16 @@ from deepspeed.compression.compress import init_compression, redundancy_clean from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd from megatron.model.transformer import ParallelTransformerLayer +import ezpz as ez +import logging from deepspeed import comm as dist +RANK = ez.get_rank() +WORLD_SIZE = ez.get_world_size() +log = logging.getLogger(__name__) +log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") + try: import wandb except (ImportError, ModuleNotFoundError): @@ -60,9 +68,9 @@ def print_datetime(string): """Note that this call will sync across all ranks.""" - torch.distributed.barrier() + tdist.barrier() time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + log.info('[' + string + '] datetime: {} '.format(time_str)) def num_floating_point_operations(args, batch_size): @@ -169,10 +177,11 @@ def pretrain( # image ... launches. global _TRAIN_START_TIME start_time_tensor = get_accelerator().DoubleTensor([_TRAIN_START_TIME]) - torch.distributed.all_reduce(start_time_tensor, - op=torch.distributed.ReduceOp.MIN) + tdist.all_reduce(start_time_tensor, op=tdist.ReduceOp.MIN) + # torch.distributed.all_reduce(start_time_tensor, + # op=torch.distributed.ReduceOp.MIN) _TRAIN_START_TIME = start_time_tensor.item() - print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + log.info('time to initialize megatron (seconds): {:.3f}'.format( time.time() - _TRAIN_START_TIME)) print_datetime('after megatron is initialized') @@ -247,16 +256,16 @@ def pretrain( args.teacher_model = setup_teacher_model(args, model_provider) # Print setup timing. 
- print_rank_0('done with setup ...') + log.info('done with setup ...') timers.log(['model-and-optimizer-setup', 'train/valid/test-data-iterators-setup'], barrier=True) if not args.skip_train: - print_rank_0('training ...') + log.info('training ...') if args.dataloader_type == 'cyclic' and args.retro_add_retriever: args.train_iters = args.retro_cyclic_train_iters - print_rank_0("retro cyclic train iters : %d" % args.train_iters) + log.info("retro cyclic train iters : %d" % args.train_iters) iteration = 0 if args.do_train and args.train_iters > 0: @@ -273,7 +282,7 @@ def pretrain( if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) else: - print_rank_0('skipping training (--skip-train is on) ...') + log.info('skipping training (--skip-train is on) ...') iteration = args.iteration @@ -321,12 +330,12 @@ def update_train_iters(args): args.global_batch_size args.train_iters = iterations - print_rank_0('setting training iterations to {}'.format(args.train_iters)) + log.info('setting training iterations to {}'.format(args.train_iters)) def setup_teacher_model(args, model_provider): - print_rank_0('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) + log.info('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration)) iteration_stuent = args.iteration num_layers_student = args.num_layers num_experts_student = args.num_experts @@ -334,7 +343,7 @@ def setup_teacher_model(args, model_provider): num_attention_heads_student = args.num_attention_heads load_student = args.load - print_rank_0('***>>>>> Setting up the teacher model') + log.info('***>>>>> Setting up the teacher model') args.num_layers = args.num_layers_teacher args.num_experts = args.num_experts_teacher @@ -342,7 +351,7 @@ def setup_teacher_model(args, model_provider): args.num_attention_heads = args.num_attention_heads_teacher args.load = args.load_teacher teacher_model, _, _ = load_model_weights_only(model_provider) - 
print_rank_0('***>>>>> Teacher model:{}'.format(teacher_model)) + log.info('***>>>>> Teacher model:{}'.format(teacher_model)) args.num_layers = num_layers_student args.num_experts = num_experts_student @@ -518,7 +527,7 @@ def get_optimizer_param_scheduler(optimizer): def load_model_weights_only(model_provider_func): """Setup model and optimizer.""" args = get_args() - print_rank_0('***>>>>> Args:{}'.format(args)) + log.info('***>>>>> Args:{}'.format(args)) model = get_model(model_provider_func) @@ -577,7 +586,7 @@ def setup_model_and_optimizer(model_provider_func, else: args.iteration = 0 student_global_steps = model[0].global_steps - print_rank_0('***>>>>> Student model, global step:{}'.format(student_global_steps)) + log.info('***>>>>> Student model, global step:{}'.format(student_global_steps)) if args.compression_training: model, _, _, _ = deepspeed.initialize( @@ -605,7 +614,7 @@ def setup_model_and_optimizer(model_provider_func, opt_param_scheduler = get_optimizer_param_scheduler(optimizer) if args.deepspeed: - print_rank_0("DeepSpeed is enabled.") + log.info("DeepSpeed is enabled.") pp = mpu.get_pipeline_model_parallel_world_size() if args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: train_ds = None @@ -681,7 +690,7 @@ def setup_model_and_optimizer(model_provider_func, # get model without FP16 and/or TorchDDP wrappers if args.iteration == 0 and len(unwrapped_model) == 1 \ and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): - print_rank_0("Initializing ICT from pretrained BERT model") + log.info("Initializing ICT from pretrained BERT model") unwrapped_model[0].init_state_dict_from_bert() if args.fp16: optimizer.reload_model_params() @@ -1238,7 +1247,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 - print_rank_last(log_string) + # print_rank_last(log_string) + 
log.info(log_string) if report_memory_flag and learning_rate > 0.: # Report memory after optimizer state has been initialized. report_memory('(after {} iterations)'.format(iteration)) @@ -1465,7 +1475,7 @@ def evaluate(forward_step_func, while iteration < args.eval_iters: iteration += 1 if verbose and iteration % args.log_interval == 0: - print_rank_0('Evaluating iter {}/{}'.format(iteration, + log.info('Evaluating iter {}/{}'.format(iteration, args.eval_iters)) forward_backward_func = get_forward_backward_func() @@ -1576,9 +1586,9 @@ def evaluate_and_print_results(prefix, forward_step_func, process_non_loss_data_func(collected_non_loss_data, iteration, writer) length = len(string) + 1 - print_rank_last('-' * length) - print_rank_last(string) - print_rank_last('-' * length) + log.info('-' * length) + log.info(string) + log.info('-' * length) def cyclic_iter(iter): @@ -1603,10 +1613,10 @@ def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): train_val_test_num_samples = [train_samples, eval_iters * args.global_batch_size, test_iters * args.global_batch_size] - print_rank_0(' > datasets target sizes (minimum size):') - print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) - print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) - print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) + log.info(' > datasets target sizes (minimum size):') + log.info(' train: {}'.format(train_val_test_num_samples[0])) + log.info(' validation: {}'.format(train_val_test_num_samples[1])) + log.info(' test: {}'.format(train_val_test_num_samples[2])) # Build the datasets. 
return build_train_valid_test_datasets_provider(train_val_test_num_samples) @@ -1620,7 +1630,7 @@ def build_train_valid_test_data_loaders( (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) - print_rank_0('> building train, validation, and test datasets ...') + log.info('> building train, validation, and test datasets ...') # Backward compatibility, assume fixed batch size. if args.iteration > 0 and args.consumed_train_samples == 0: From aa2cd591cec5a355b59abd5ec248aed1f462f79f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 17 Apr 2024 08:24:58 -0500 Subject: [PATCH 177/268] Fix checkpointing with `schedulefree.*` optimizers --- megatron/checkpointing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 04201fcc06..d585baf717 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -608,16 +608,16 @@ def load_checkpoint( args = get_args() assert args is not None load_dir = getattr(args, load_arg) + lr_state_dict = {} + lr_tensor = torch.tensor(args.lr, requires_grad=False, device=DEVICE) if RANK == 0: lr_state_dict = load_lr_state_dict(strict=strict_lr_state_dict) - lr_tensor = torch.tensor( - lr_state_dict['lr'], - requires_grad=False, - device=DEVICE - ) - else: - lr_state_dict = {} - lr_tensor = torch.tensor(0., requires_grad=False, device=DEVICE) + if len(lr_state_dict.keys()) > 0 and 'lr' in lr_state_dict: + lr_tensor = torch.tensor( + lr_state_dict['lr'], + requires_grad=False, + device=DEVICE, + ) tdist.broadcast(lr_tensor, 0) args.lr = lr_tensor.item() if args.deepspeed: From a365a182a3f17edf90d59021088ef28855ead611 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 17 Apr 2024 09:03:41 -0500 Subject: [PATCH 178/268] Add `--schedulefree-foreach` flag From https://github.com/facebookresearch/schedule_free/blob/6db4953ccde631baca5f5fd2d9e5cab21caf9ed4/schedulefree/adamw_schedulefree.py#L39 --- megatron/arguments.py | 
35 +++++++++---- megatron/optimizer/__init__.py | 95 +++++++++++++++++++--------------- 2 files changed, 78 insertions(+), 52 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 5f9b6f4144..b982337f51 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -915,17 +915,30 @@ def _add_training_args(parser): group.add_argument('--disable-bias-linear', action='store_false', help='Disable bias in the linear layers', dest='add_bias_linear') - group.add_argument('--optimizer', type=str, default='adam', - choices=[ - 'adam', - 'adamw', - 'sgd', - 'apex.adam', - 'apex.sgd', - 'adamwschedulefree', - 'sgdschedulefree' - ], - help='Optimizer function') + group.add_argument( + '--optimizer', + type=str, + default='adam', + choices=[ + 'adam', + 'adamw', + 'sgd', + 'apex.adam', + 'apex.sgd', + 'adamwschedulefree', + 'sgdschedulefree' + ], + help='Optimizer function' + ) + group.add_argument( + "--schedulefree-for-each", + action="store_true", + help=""" + Use a foreach-backed implementation of the schedulefree optimizers. + Should be significantly faster, + but will have a higher peak memory usage. + """, + ) group.add_argument('--dataloader-type', type=str, default=None, choices=['single', 'cyclic'], help='Single pass vs multiple pass data loader') diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index fdaa1f7dc7..6dfd0ea40f 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -60,21 +60,29 @@ def get_param_groups(modules, return param_groups -def get_megatron_optimizer(model, - no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): + +def get_megatron_optimizer( + model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0 +): args = get_args() # Base optimizer. 
- param_groups = get_param_groups(model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult) + param_groups = get_param_groups( + model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult + ) if args.create_moe_param_group: - from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups) - + from deepspeed.moe.utils import ( + split_params_into_different_moe_groups_for_optimizer + ) + param_groups = split_params_into_different_moe_groups_for_optimizer( + param_groups + ) if args.cpu_optimizer: assert args.optimizer == 'adam', 'CPU offloading is for Adam' if args.cpu_torch_adam: @@ -82,36 +90,44 @@ def get_megatron_optimizer(model, else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam - optimizer = cpu_adam_optimizer(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) + optimizer = cpu_adam_optimizer( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) elif str(args.optimizer).lower() == 'adamwschedulefree': import schedulefree optimizer = schedulefree.AdamWScheduleFree( param_groups, lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, ) elif str(args.optimizer).lower() == 'sgdschedulefree': import schedulefree optimizer = schedulefree.SGDScheduleFree( param_groups, lr=args.lr, + momentum=args.sgd_momentum, + weight_decay=args.weight_decay, warmup_steps=args.lr_warmup_iters, + foreach=args.schedulefree_for_each, ) - # else: elif str(args.optimizer).lower() == 'apex.adam': - assert get_accelerator().device_name() == 'cuda' - from apex.optimizers import FusedAdam as Adam - optimizer = Adam( - param_groups, - lr=args.lr, - 
weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps - ) + assert get_accelerator().device_name() == 'cuda' + from apex.optimizers import FusedAdam as Adam + optimizer = Adam( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps + ) elif str(args.optimizer).lower() == 'apex.sgd': from apex.optimizers import FusedSGD as SGD optimizer = SGD( @@ -153,18 +169,15 @@ def get_megatron_optimizer(model, raise TypeError(f'{args.optimizer} optimizer is not supported.') if args.deepspeed: return optimizer - # Determine whether the params have main-grad field. params_have_main_grad = False if args.use_contiguous_buffers_in_local_ddp: params_have_main_grad = True - # Mixed precision optimizer. # - Note: both the Float16Optimizer and the DistributedOptimizer inherit # from the MixedPrecisionOptimizer, which manages any optimizer where # the model params and main params are distinct. if args.fp16 or args.bf16 or args.use_distributed_optimizer: - # Grad scaler: # if loss-scale is provided, instantiate the constant scaler. # if we are using fp16 and loss-scale is not present, use a @@ -172,11 +185,9 @@ def get_megatron_optimizer(model, # otherwise we are running in bf16 with no loss-scale so # leave it as None. grad_scaler = None - # Constant loss scale. if args.loss_scale: grad_scaler = ConstantGradScaler(args.loss_scale) - # Dynamic loss scale. else: if args.fp16: @@ -187,11 +198,11 @@ def get_megatron_optimizer(model, backoff_factor=0.5, growth_interval=args.loss_scale_window, hysteresis=args.hysteresis) - # Megatron optimizer. 
- opt_ty = DistributedOptimizer \ - if args.use_distributed_optimizer else \ - Float16OptimizerWithFloat16Params + opt_ty = ( + DistributedOptimizer if args.use_distributed_optimizer + else Float16OptimizerWithFloat16Params + ) return opt_ty(optimizer, args.clip_grad, args.log_num_zeros_in_grad, @@ -202,10 +213,12 @@ def get_megatron_optimizer(model, args.params_dtype, grad_scaler, model) - # FP32. - return FP32Optimizer(optimizer, args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - model) + return FP32Optimizer( + optimizer, + args.clip_grad, + args.log_num_zeros_in_grad, + params_have_main_grad, + args.use_contiguous_buffers_in_local_ddp, + model + ) From eecf70d55ea4c8b46cb714bc127087aa645299aa Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 17 Apr 2024 09:05:16 -0500 Subject: [PATCH 179/268] Replace `{:.6E}` with `{:.6f}` in `log_string` formatting --- megatron/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index c02ca8d0ef..43fd359ab0 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1218,7 +1218,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, avg = total_loss_dict[key].item() / \ float(max(1, total_loss_dict[advanced_iters_key])) if avg > 0.0: - log_string += ' {}: {:.6E} |'.format(key, avg) + log_string += ' {}: {:.6f} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) if loss_scale is not None: log_string += ' loss scale: {:.1f} |'.format(loss_scale) From 6969dc2d09bc845e3982aa0fb416760830cca9c3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 17 Apr 2024 11:06:11 -0500 Subject: [PATCH 180/268] Update `megatron/training.py` --- megatron/training.py | 45 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 43fd359ab0..67cd47bda1 100644 
--- a/megatron/training.py +++ b/megatron/training.py @@ -70,7 +70,7 @@ def print_datetime(string): """Note that this call will sync across all ranks.""" tdist.barrier() time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - log.info('[' + string + '] datetime: {} '.format(time_str)) + log.info('[' + string + '] datetime={} '.format(time_str)) def num_floating_point_operations(args, batch_size): @@ -181,7 +181,7 @@ def pretrain( # torch.distributed.all_reduce(start_time_tensor, # op=torch.distributed.ReduceOp.MIN) _TRAIN_START_TIME = start_time_tensor.item() - log.info('time to initialize megatron (seconds): {:.3f}'.format( + log.info('time to initialize megatron (seconds)={:.3f}'.format( time.time() - _TRAIN_START_TIME)) print_datetime('after megatron is initialized') @@ -435,7 +435,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap # Print number of parameters. if mpu.get_data_parallel_rank() == 0: print(' > number of parameters on (tensor, pipeline) ' - 'model parallel rank ({}, {}): {}'.format( + 'model parallel rank ({}, {})={}'.format( mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()]) @@ -1185,18 +1185,18 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, log_string = f' iteration {iteration:8d}/{args.train_iters:8d} |' # .format( iteration, args.train_iters) log_string += ( - f' consumed samples: {args.consumed_train_samples:12d} |' + f' consumed samples={args.consumed_train_samples:12d} |' # .format(args.consumed_train_samples) ) - log_string += f' consumed tokens: {args.consumed_train_tokens:12d} |' + log_string += f' consumed tokens={args.consumed_train_tokens:12d} |' # .format( args.consumed_train_tokens) log_string += ( ' elapsed time per iteration (ms): ' f'{elapsed_time_per_iteration * 1000.0:.1f} |' # .format( elapsed_time_per_iteration * 1000.0) ) - log_string += f' 
learning rate: {learning_rate:.3E} |' - log_string += f' global batch size: {batch_size:5d} |' + log_string += f' learning rate={learning_rate:.3f} |' + log_string += f' global batch size={batch_size:5d} |' # if wandb is not None and getattr(wandb, 'run', None) is not None: wandb_metrics |= { 'training/iteration': iteration, @@ -1218,32 +1218,32 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, avg = total_loss_dict[key].item() / \ float(max(1, total_loss_dict[advanced_iters_key])) if avg > 0.0: - log_string += ' {}: {:.6f} |'.format(key, avg) + log_string += ' {}={:.6f} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) if loss_scale is not None: - log_string += ' loss scale: {:.1f} |'.format(loss_scale) + log_string += ' loss scale={:.1f} |'.format(loss_scale) wandb_metrics |= {'loss/loss_scale': loss_scale} if grad_norm is not None: - log_string += ' grad norm: {:.3f} |'.format(grad_norm) + log_string += ' grad norm={:.3f} |'.format(grad_norm) wandb_metrics |= {'loss/grad_norm': grad_norm} if num_zeros_in_grad is not None: - log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) + log_string += ' num zeros={:.1f} |'.format(num_zeros_in_grad) wandb_metrics |= {'loss/num_zeros_in_grad': num_zeros_in_grad} if params_norm is not None: - log_string += ' params norm: {:.3f} |'.format(params_norm) + log_string += ' params norm={:.3f} |'.format(params_norm) wandb_metrics |= {'loss/params_norm': params_norm} if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) + log_string += ' curriculum seqlen={:5d} |'.format(args.curriculum_seqlen) if args.random_ltd: - log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) - log_string += ' actual seqlen: {:5d} |'.format(seq_len) - log_string += ' number of skipped iterations: {:3d} |'.format( + log_string += ' random ltd reserved 
length={:5d} |'.format(args.random_ltd_reserved_length) + log_string += ' actual seqlen={:5d} |'.format(seq_len) + log_string += ' number of skipped iterations={:3d} |'.format( total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations: {:3d} |'.format( + log_string += ' number of nan iterations={:3d} |'.format( total_loss_dict[nan_iters_key]) - log_string += ' samples per second: {:.3f} |'.format(samples_per_sec) - log_string += ' tokens per gpu per second (tgs): {:.3f} |'.format(tokens_per_gpu_per_second) - log_string += ' TFLOPs: {:.2f} |'.format(tflops) + log_string += ' samples per second={:.3f} |'.format(samples_per_sec) + log_string += ' tokens per gpu per second (tgs)={:.3f} |'.format(tokens_per_gpu_per_second) + log_string += ' TFLOPs={:.2f} |'.format(tflops) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 @@ -1560,9 +1560,10 @@ def evaluate_and_print_results(prefix, forward_step_func, process_non_loss_data_func, config, verbose) string = ' validation loss at {} | '.format(prefix) for key in total_loss_dict: - string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + string += f"{key} value={total_loss_dict[key].item():.6f}" ppl = math.exp(min(20, total_loss_dict[key].item())) - string += '{} PPL: {:.6E} | '.format(key, ppl) + string += f"{key} PPL={ppl:.6f}" + # string += '{} PPL={:.6f} | '.format(key, ppl) if writer and is_last_rank(): data_type = 'test' if test else 'validation' writer.add_scalar(f'lm-loss-validation/{key} {data_type}', From 1d30d41dd33de0e26d9253e581120c10aa8c99c7 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 08:39:03 -0500 Subject: [PATCH 181/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 63 ++++++++----------------------------------------- 1 file changed, 10 insertions(+), 53 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index cd04f890b9..151c3e857a 100644 --- a/ALCF/helpers.sh +++ 
b/ALCF/helpers.sh @@ -28,9 +28,6 @@ function setupSrun() { function setDSlauncher() { # launcher setting outdir=$1 - # hfds=$1 - # hfmpi=$2 - # here=$(python3 -c 'import os; print(os.getcwd())') export hfds="$outdir/hostfile_deepspeed" export hfmpi="$outdir/hostfile_mpich" [ -f "$hfds" ] || exit @@ -90,8 +87,8 @@ setParams() { export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} # ---- Llama2 7B Config ------------------------------ export MODEL_KEY="Llama-7B" - export HEADS=${HEADS:-32} - export NLAYERS=${NLAYERS:-32} + export HEADS=${HEADS:-${NHEADS:-32}} + export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} export HIDDEN=${HIDDEN:-4096} export NUM_KV_HEAD=${NUM_KV_HEAD:-8} export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} @@ -146,6 +143,10 @@ setArgs() { export gpt_args } +# +---------------------------------------+ +# | 1. Git clone `ezpz` (if not found) | +# | 2. Install `ezpz` (if not installed) | +# +---------------------------------------+ ezpz() { if [[ ! -d ezpz ]]; then git clone https://github.com/saforem2/ezpz @@ -164,6 +165,10 @@ ezpz() { # source ezpz/src/ezpz/bin/getjobenv || exit } +# +------------------------------------------------------------------------+ +# | Save important environment variables to .deepspeed_env, which will be | +# | forwarded to ALL ranks with DeepSpeed | +# +------------------------------------------------------------------------+ saveDSenv() { echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" { @@ -202,8 +207,6 @@ buildDSconfig() { sumWeights() { local file_list=$1 weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") - # weights=$(echo "$weights" | tr ",]" "]") - # echo "weights: $weights" python3 -c "import numpy as np; print(np.sum(${weights}))" } @@ -298,52 +301,6 @@ setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------------- echo "--------------------" } -# buildCLIargs() { # ---- [BROKEN] ------------------------------------------- -# custom_args=" $@" -# export CLI_ARGS=" -# --$DTYPE \ -# --num-workers 0 \ -# --split 100,0,0 \ -# --log-interval 1 \ -# --use-flash-attn-v2 \ -# --no-bias-gelu-fusion \ -# --lr-decay-style cosine \ -# --no-bias-dropout-fusion \ -# --no-masked-softmax-fusion \ -# --tokenizer-type Llama2Tokenizer \ -# --no-gradient-accumulation-fusion \ -# --accumulate-allreduce-grads-in-fp32 \ -# --use-checkpoint-opt_param-scheduler \ -# --lr ${LR} \ -# --save ${CKPT_DIR} \ -# --load ${CKPT_DIR} \ -# --seq-length ${SEQ} \ -# --num-layers ${NLAYERS} \ -# --hidden-size ${HIDDEN} \ -# --train-iters ${TRAIN_ITER} \ -# --eval-iters ${EVAL_ITERS} \ -# --distributed-backend ${BE} \ -# --num-attention-heads ${HEADS} \ -# --save-interval ${SAVE_INTERVAL} \ -# --eval-interval ${EVAL_INTERVAL} \ -# --max-position-embeddings ${SEQ} \ -# --micro-batch-size ${MICRO_BATCH} \ -# --data-file-list ${DATA_FILE_LIST} \ -# --tensor-model-parallel-size ${TP} \ -# --global-batch-size ${GLOBAL_BATCH} \ -# --pipeline-model-parallel-size ${PP} \ -# --num-key-value-heads ${NUM_KV_HEAD} \ -# --data-cache-path ${DATA_CACHE_PATH} \ -# --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ -# --tokenizer-model ${TOKENIZER_MODEL} \ -# $ds_args \ -# ${LLAMA_ARGS} \ -# ${gpt_args[*]} \ -# ${custom_args} \ -# " -# } - - printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" } From 981b7d9896e4c416a842e3b3af09b3f36d46f889 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 08:39:35 -0500 Subject: [PATCH 182/268] Update logging in `megatron/training.py` --- megatron/training.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 67cd47bda1..2bdf61f908 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1182,21 +1182,21 @@ def training_log(loss_dict, 
total_loss_dict, learning_rate, iteration, elapsed_time_per_iteration, args.consumed_train_samples) writer.add_scalar('iteration-time/iteration-time vs tokens', elapsed_time_per_iteration, args.consumed_train_tokens) - log_string = f' iteration {iteration:8d}/{args.train_iters:8d} |' + log_string = f' iteration={iteration:8d}/{args.train_iters:8d} |' # .format( iteration, args.train_iters) log_string += ( - f' consumed samples={args.consumed_train_samples:12d} |' + f' consumed_samples={args.consumed_train_samples:12d} |' # .format(args.consumed_train_samples) ) - log_string += f' consumed tokens={args.consumed_train_tokens:12d} |' + log_string += f' consumed_tokens={args.consumed_train_tokens:12d} |' # .format( args.consumed_train_tokens) log_string += ( - ' elapsed time per iteration (ms): ' + ' elapsed_time_per_iteration_ms=' f'{elapsed_time_per_iteration * 1000.0:.1f} |' # .format( elapsed_time_per_iteration * 1000.0) ) - log_string += f' learning rate={learning_rate:.3f} |' - log_string += f' global batch size={batch_size:5d} |' + log_string += f' learning_rate={learning_rate:.6f} |' + log_string += f' global_batch_size={batch_size:5d} |' # if wandb is not None and getattr(wandb, 'run', None) is not None: wandb_metrics |= { 'training/iteration': iteration, @@ -1221,28 +1221,28 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, log_string += ' {}={:.6f} |'.format(key, avg) total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) if loss_scale is not None: - log_string += ' loss scale={:.1f} |'.format(loss_scale) + log_string += ' loss_scale={:.1f} |'.format(loss_scale) wandb_metrics |= {'loss/loss_scale': loss_scale} if grad_norm is not None: - log_string += ' grad norm={:.3f} |'.format(grad_norm) + log_string += ' grad_norm={:.3f} |'.format(grad_norm) wandb_metrics |= {'loss/grad_norm': grad_norm} if num_zeros_in_grad is not None: - log_string += ' num zeros={:.1f} |'.format(num_zeros_in_grad) + log_string += ' num_zeros={:.1f} 
|'.format(num_zeros_in_grad) wandb_metrics |= {'loss/num_zeros_in_grad': num_zeros_in_grad} if params_norm is not None: - log_string += ' params norm={:.3f} |'.format(params_norm) + log_string += ' params_norm={:.3f} |'.format(params_norm) wandb_metrics |= {'loss/params_norm': params_norm} if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: - log_string += ' curriculum seqlen={:5d} |'.format(args.curriculum_seqlen) + log_string += ' curriculum_seqlen={:5d} |'.format(args.curriculum_seqlen) if args.random_ltd: - log_string += ' random ltd reserved length={:5d} |'.format(args.random_ltd_reserved_length) - log_string += ' actual seqlen={:5d} |'.format(seq_len) - log_string += ' number of skipped iterations={:3d} |'.format( + log_string += ' random_ltd reserved_length={:5d} |'.format(args.random_ltd_reserved_length) + log_string += ' actual_seqlen={:5d} |'.format(seq_len) + log_string += ' number_of_skipped_iterations={:3d} |'.format( total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations={:3d} |'.format( + log_string += ' number_of_nan_iterations={:3d} |'.format( total_loss_dict[nan_iters_key]) - log_string += ' samples per second={:.3f} |'.format(samples_per_sec) - log_string += ' tokens per gpu per second (tgs)={:.3f} |'.format(tokens_per_gpu_per_second) + log_string += ' samples_per_second={:.3f} |'.format(samples_per_sec) + log_string += ' tokens_per_gpu_per_second_tgs={:.3f} |'.format(tokens_per_gpu_per_second) log_string += ' TFLOPs={:.2f} |'.format(tflops) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 From 7e6a3a42a8a8707fee77c844ce83015c6ce9e4d5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 08:40:44 -0500 Subject: [PATCH 183/268] Remove redundant `train_llama_alcf_*.sh` --- train_llama_alcf_aurora.sh | 234 ------------------------------------ train_llama_alcf_polaris.sh | 116 ------------------ train_llama_alcf_sunspot.sh | 158 ------------------------ 3 
files changed, 508 deletions(-) delete mode 100644 train_llama_alcf_aurora.sh delete mode 100644 train_llama_alcf_polaris.sh delete mode 100644 train_llama_alcf_sunspot.sh diff --git a/train_llama_alcf_aurora.sh b/train_llama_alcf_aurora.sh deleted file mode 100644 index 48651dbeb1..0000000000 --- a/train_llama_alcf_aurora.sh +++ /dev/null @@ -1,234 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home -# - -function sourceFile() { - fp="$1" - if [[ -f "${fp}" ]]; then - echo "Found ${fp}, \`source\`-ing" - # shellcheck source="${fp}" - source "${fp}" - else - echo "ERROR: UNABLE TO SOURCE ${fp}" - fi -} - -# +++++++++++++++ SCRIPT START ++++++++++++++++++++++ -# ---- source ./helpers_alcf.sh --------------------- -cd "${PBS_O_WORKDIR}" || exit -HERE=$(python3 -c 'import os; print(os.getcwd())') -sourceFile "${HERE}/ALCF_utils/helpers_alcf.sh" || exit -# cd ~/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed || exit -# eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 -ezpz || exit -setEnv || exit -saveDSenv || exit -makeHostfiles || exit -setupData "${DATA_FILE_LIST:-${HERE}/data_file_list_reweighted.txt}" || exit -# dfl_fallback="${HERE}/data_file_list_shuf_debug.txt" - -# # ---- DATA SETUP ------------------------------------ -# dfl_debug="./data_file_list_shuf_debug.txt" -# DATA_FILE_LIST="${DATA_FILE_LIST:-${dfl_debug}}" && export DATA_FILE_LIST="${DATA_FILE_LIST}" -# NUM_DOCS=$(wc -l < "${DATA_FILE_LIST}") && export NUM_DOCS="${NUM_DOCS}" -# WEIGHT_SUM="$(sumWeights "${DATA_FILE_LIST}")" && export WEIGHT_SUM="${WEIGHT_SUM}" -# DFL_STEM=$(echo "$DATA_FILE_LIST" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") && export DFL_STEM="${DFL_STEM}" -# dcp="${HERE}/.cache/${DFL_STEM}-index-cache" -# DATA_CACHE_PATH="${DATA_CACHE_PATH:-${dcp}}" && export DATA_CACHE_PATH="${DATA_CACHE_PATH}" -# mkdir -p "${DATA_CACHE_PATH}" -# if [[ 
-n "${DOLMA_CHUNK_IDX}" ]]; then -# echo "Using DOLMA CHUNK ${DOLMA_CHUNK_IDX} from ${DATA_FILE_LIST} with ${NUM_DOCS} documents..." -# else -# echo "Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -# fi - - -# ---- Parallelism Settings -------------------------- -PP=${PP:-1} -TP=${TP:-1} -export PP="${PP}" -export TP="${TP}" -export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" -export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} -# export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${PBS_NODEFILE}")} -# ---------------------------------------------------- - -# ---- Llama2 7B Config ----------------------- -export HEADS=${HEADS:-32} -export NLAYERS=${NLAYERS:-32} -export HIDDEN=${HIDDEN:-4096} -export NUM_KV_HEAD=${NUM_KV_HEAD:-8} -export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" -# --------------------------------------------- - -# ---- Run Settings --------------------------- -export LR=${LR:-0.0003} -export SEQ=${SEQ:-4096} -export DTYPE=${DTYPE:-bf16} -export ZERO_STAGE=${ZERO_STAGE:-2} -export MICRO_BATCH=${MICRO_BATCH:-4} -export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} -export TRAIN_ITER=${TRAIN_ITER:-317892} -export SAVE_INTERVAL=${SAVE_INTERVAL:-200} -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} -export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) -export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-0} -export TOKENIZER_MODEL="/lus/gecko/projects/Aurora_deployment/AuroraGPT/datasets/dolma/utils/tokenizer.model" -# export EXTRA_ARGS="" -export LLAMA_ARGS="--no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" -# --------------------------------------------- - -# ---- Build DeepSpeed Config --------------------------------- -export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" 
-bash "${HERE}/generate_config.sh" "${DS_CONFIG}" || exit -# ------------------------------------------------------------- - - -# ---- Specify output location -------------------------------- -export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}" -# OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} -OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" -export OUTPUT_DIR="${OUTPUT_DIR}" -export OUTPUT_LOG="${OUTPUT_DIR}/output.log" -export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" -echo "${OUTPUT_LOG}" >> "logs/latest" -mkdir -p "${OUTPUT_DIR}" -echo "!!!Please see logs at ${OUTPUT_DIR}" - - -gpt_args=() -ds_args=" " -ds_args=" --deepspeed ${ds_args}" -if [ "$PP" == 1 ]; then - ds_args=" --no-pipeline-parallel ${ds_args}" -fi -ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" -ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" - -# BUG: [???] ---------------------------------------------------------------- -# I dont know where this came from... -# > we are now using activation checkpoint provided by megatron, see below. -# --------------------------------------------------------------------------- -# -# NOTE: [???] --------------------------------------------------------------- -# In `train_llama_alcf_polaris.sh` we also pass -# `"--checkpoint-num-layers 1"` -# ---------------------------------------------------------------------------- -if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then - echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" 
- ds_args=" --deepspeed-activation-checkpointing ${ds_args}" - gpt_args+=( - "--checkpoint-activations" - ) - # "--checkpoint-num-layers 1" - # --checkpoint-activations \ - # --deepspeed-activation-checkpointing -fi - -# take custom args -custom_args=" $@" - -# Ensure `./hostfile_deepspeed` and `./hostfile_mpich` exist in $(pwd) -hfds="${HERE}/hostfile_deepspeed" -hfmpi="${HERE}/hostfile_mpich" -[ -f "$hfds" ] || exit -[ -f "$hfmpi" ] || exit - -# launcher setting -LAUNCHER=${LAUNCHER:-MPICH} -if [[ $LAUNCHER == "deepspeed" ]]; then - launcher="" -else - launcher="--force_multi --hostfile ${hfds} --launcher=${LAUNCHER} --launcher_args='-hostfile ${hfmpi}'" -fi - - -if [[ $(hostname) == x4* ]]; then - CCL=${CCL:-ccl} - BE="${CCL}" -elif [[ $(hostname) == x3* ]]; then - NCCL=${NCCL:-nccl} - BE="${NCCL}" -fi -# NCCL=${NCCL:-nccl} -EXEC=pretrain_gpt_alcf.py - -# MODEL=LLAMA_7B -# OUTPUT_PREFIX=${MODEL}_z${ZERO_STAGE}_seqlen_tp${TP}_pp${PP}_sp${SP}_nl${NUM_LAYERS}_hs${HIDDEN_SIZE}_gb${BS}_mb${MBS} -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" -echo "- WORLD_SIZE:${WORLD_SIZE}" -echo "- BACKEND: ${BE}" -echo "- MODEL_TYPE: ${MODEL_TYPE}" -echo "- DOCUMENT WEIGHT_SUM: ${WEIGHT_SUM}" -echo "- Using DATA_FILE_LIST: ${DATA_FILE_LIST}" -echo "- Using NUM_DOCS=${NUM_DOCS} documents from DATA_FILE_LIST=${DATA_FILE_LIST}" -echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - -run_cmd=" - deepspeed $launcher ${EXEC} \ - --use-flash-attn \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --tensor-model-parallel-size $TP \ - --pipeline-model-parallel-size $PP \ - --num-layers $NLAYERS \ - --hidden-size $HIDDEN \ - --num-attention-heads $HEADS \ - --seq-length $SEQ \ - --max-position-embeddings $SEQ \ - --micro-batch-size $MICRO_BATCH \ - --global-batch-size $GLOBAL_BATCH \ - --train-iters $TRAIN_ITER \ - --lr ${LR} \ - --lr-decay-style cosine \ - --log-interval 1 \ - --save-interval ${SAVE_INTERVAL} \ - --split 100,0,0 \ - --$DTYPE \ - --no-masked-softmax-fusion \ - 
--no-bias-gelu-fusion \ - --no-bias-dropout-fusion \ - --no-gradient-accumulation-fusion \ - --distributed-backend ${BE} \ - --tokenizer-type Llama2Tokenizer \ - --save checkpoints/${OUTPUT_PREFIX} \ - --load checkpoints/${OUTPUT_PREFIX} \ - --use-checkpoint-opt_param-scheduler \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --data-file-list ${DATA_FILE_LIST} \ - --data-cache-path ${DATA_CACHE_PATH} \ - $ds_args \ - ${LLAMA_ARGS} \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} - " - # >> ${OUTPUT_LOG} 2>&1 & - # |& tee $OUTPUT_DIR/output.log - -# --ffn-hidden-size 11008 \ -# --vocab-file $VOCAB_FILE \ -# --merge-file $MERGE_FILE \ -# --lr-decay-iters 320000 \ -# --num-workers 0 \ -# --eval-iters ${EVAL_ITERS} \ -# --eval-interval ${EVAL_INTERVAL} \ -# --lr-warmup-iters 5000 \ -# --lr-decay-iters 10000 \ -# --accumulate-allreduce-grads-in-fp32 \ -# --data-impl mmap \ - -echo "All DeepSpeed(s): $(which -a deepspeed)" -echo "Using $(which deepspeed)" -ds_report - -echo "${run_cmd}" - -printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" - -eval "${run_cmd}" -set +x diff --git a/train_llama_alcf_polaris.sh b/train_llama_alcf_polaris.sh deleted file mode 100644 index 06f268f10e..0000000000 --- a/train_llama_alcf_polaris.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home - -function sourceFile() { - fp="$1" - echo "source-ing ${fp}" - if [[ -f "${fp}" ]]; then - # shellcheck source="${fp}" - source "${fp}" - else - echo "ERROR: UNABLE TO SOURCE ${fp}" - fi -} - -# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- -cd "${PBS_O_WORKDIR}" || exit -HERE=$(python3 -c 'import os; print(os.getcwd())') -export HERE -# ---- 1. 
Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- -export EXEC="${HERE}/pretrain_gpt_alcf.py" -[ -f "${EXEC}" ] || exit -# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ -sourceFile "${HERE}/ALCF/helpers.sh" || exit -# ---- 3. Call fns from `./ALCF/helpers_alcf.sh` ------------------------------ -setEnv || exit # 1. load `conda` environment -saveDSenv || exit # 2. save env vars to `.deepspeed_env` -ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ -setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} -setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset -setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` -printJobInfo || exit # 11. 
print job info -# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# Take custom args -custom_args=" $@" - -# Assert `./hostfile_deepspeed` exists -export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit -TBDIR="${CKPT_DIR}/tensorboard" -mkdir -p "${TBDIR}" - -# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit -# echo "Using $(which python3)" -# --launcher_args='--pmi=pmix' - # deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ - # ${launch_cmd} \ - # --optimizer adam \ - # --use-flash-attn-v2 \ - # --num-workers 0 \ -run_cmd=" - deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --$DTYPE \ - --optimizer ${OPT} \ - --split 100,0,0 \ - --log-interval 1 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --tensorboard-dir ${TBDIR} \ - --log-timers-to-tensorboard \ - --log-optimizer-states-to-tensorboard \ - --lr ${LR} \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --seq-length ${SEQ} \ - --num-layers ${NLAYERS} \ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${BE} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - ${LLAMA_ARGS} \ - $ds_args \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} - " - - -# echo "All DeepSpeed(s): $(which -a 
deepspeed)" -echo "! Using $(which deepspeed)" -ds_report - -echo "${run_cmd}" - -printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" -# echo "${OUTPUT_LOG}" -eval "${run_cmd}" -set +x diff --git a/train_llama_alcf_sunspot.sh b/train_llama_alcf_sunspot.sh deleted file mode 100644 index 700f5dbfdb..0000000000 --- a/train_llama_alcf_sunspot.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/bash --login -#PBS -l walltime=06:00:00 -#PBS -A argonne_tpc -#PBS -q prod -#PBS -l select=48 -#PBS -l filesystems=eagle:home - -function sourceFile() { - fp="$1" - echo "source-ing ${fp}" - if [[ -f "${fp}" ]]; then - # shellcheck source="${fp}" - source "${fp}" - else - echo "ERROR: UNABLE TO SOURCE ${fp}" - fi -} - -module () { - if [ -z "${LMOD_SH_DBG_ON+x}" ] - then - case "$-" in - (*v*x*) __lmod_sh_dbg='vx' ;; - (*v*) __lmod_sh_dbg='v' ;; - (*x*) __lmod_sh_dbg='x' ;; - esac - fi - if [ -n "${__lmod_sh_dbg:-}" ] - then - set +$__lmod_sh_dbg - echo "Shell debugging temporarily silenced: export LMOD_SH_DBG_ON=1 for Lmod's output" >&2 - fi - eval "$($LMOD_CMD $LMOD_SHELL_PRGM "$@")" && eval "$(${LMOD_SETTARG_CMD:-:} -s sh)" - __lmod_my_status=$? - if [ -n "${__lmod_sh_dbg:-}" ] - then - echo "Shell debugging restarted" >&2 - set -$__lmod_sh_dbg - fi - unset __lmod_sh_dbg - return $__lmod_my_status -} - - -# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# ---- 0. Navigate into `$PBS_O_WORKDIR` ------------------------------------- -cd "${PBS_O_WORKDIR}" || exit -HERE=$(python3 -c 'import os; print(os.getcwd())') -export HERE -# ---- 1. Assert `./pretrain_gpt_alcf.py` exists: ----------------------------- -export EXEC="${HERE}/pretrain_gpt_alcf.py" -[ -f "${EXEC}" ] || exit -# ---- 2. `source ./ALCF/helpers_alcf.sh`: ------------------------------------ -sourceFile "${HERE}/ALCF/helpers.sh" || exit -# ---- 3. 
Call fns from `./ALCF/helpers_alcf.sh` ------------------------------------------------------------------ -setEnv || exit # 1. load `conda` environment -saveDSenv || exit # 2. save env vars to `.deepspeed_env` -ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ -setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} -setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset -setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` -printJobInfo || exit # 11. print job info -# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# Take custom args -custom_args=" $@" - -# Assert `./hostfile_deepspeed` exists -export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit - -TBDIR="${CKPT_DIR}/tensorboard" -mkdir -p "${TBDIR}" - - # --use-flash-attn-v2 \ - # --use-flash-attn \ - # --$DTYPE \ - # --optimizer adamw \ - # --adam-beta1 0.9 \ - # --adam-beta2 0.95 \ -run_cmd=" - deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - --${DTYPE} \ - --optimizer ${OPT} \ - --num-workers 0 \ - --split 100,0,0 \ - --log-interval 1 \ - --no-bias-gelu-fusion \ - --lr-decay-style cosine \ - --no-bias-dropout-fusion \ - --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ - --no-gradient-accumulation-fusion \ - --accumulate-allreduce-grads-in-fp32 \ - --use-checkpoint-opt_param-scheduler \ - --tensorboard-dir ${TBDIR} \ - --log-timers-to-tensorboard \ - --log-optimizer-states-to-tensorboard \ - --lr ${LR} \ - --save ${CKPT_DIR} \ - --load ${CKPT_DIR} \ - --seq-length ${SEQ} \ - --num-layers ${NLAYERS} 
\ - --hidden-size ${HIDDEN} \ - --train-iters ${TRAIN_ITER} \ - --eval-iters ${EVAL_ITERS} \ - --distributed-backend ${BE} \ - --num-attention-heads ${HEADS} \ - --save-interval ${SAVE_INTERVAL} \ - --eval-interval ${EVAL_INTERVAL} \ - --max-position-embeddings ${SEQ} \ - --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ - --tensor-model-parallel-size ${TP} \ - --global-batch-size ${GLOBAL_BATCH} \ - --pipeline-model-parallel-size ${PP} \ - --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ - --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - ${LLAMA_ARGS} \ - $ds_args \ - ${gpt_args[*]} \ - $custom_args \ - |& tee ${OUTPUT_LOG} - " - - # --------------------------------------------------- - # --vocab-file $VOCAB_FILE \ - # --merge-file $MERGE_FILE \ - # --lr-decay-iters 320000 \ - # --lr-warmup-iters 5000 \ - # --lr-decay-iters 10000 \ - # --num-workers 4 \ - # launch python3 ${EXEC} \ - # --data-impl mmap \ - # source ./ezpz/src/ezpz/bin/getjobenv || exit - # --------------------------------------------------- - # ${DIST_LAUNCH} ./local_rank.sh python3 ${EXEC} \ - # ${DIST_LAUNCH} python3 ${EXEC} \ - # deepspeed $launcher ${EXEC} \ - # >> ${OUTPUT_LOG} 2>&1 & - # >> ${OUTPUT_LOG} 2>&1 & - # |& tee $OUTPUT_DIR/output.log - # ${EXTRA_ARGS} \ - -echo "! Using $(which deepspeed)" -ds_report - -echo "${run_cmd}" -printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" -# echo "${OUTPUT_LOG}" -eval "${run_cmd}" -set +x From d243489c1c6b0e41e39e3e81d78b42a04538a849 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 11:27:20 -0500 Subject: [PATCH 184/268] Update README.md --- ALCF/README.md | 463 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 461 insertions(+), 2 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index d95c70125d..5f0fd1b6bd 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,6 +1,9 @@ # Megatron-DeepSpeed @ ALCF -## Polaris +## ✅ TODOs + +
+TODOs: - [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - [ ] specifically, `momentum, beta{1, 2}, etc` @@ -69,6 +72,456 @@
+
+ +## 🏃‍♂️ Running + +> [!NOTE] +> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching +> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. + +To launch: + +```bash +$ qsub -A <your-project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I +$ cd /path/to/Megatron-DeepSpeed/ +# load your favorite {conda, venv} environment, requires: {pytorch, deepspeed} +# e.g. on Polaris: +$ module load conda/2023-10-04 # ; conda activate cu118-pt221 ; unset PYTHONUSERBASE +$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./convergence_debug_small.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh +``` + +
[output]: + +
+ +
[Sunspot]: + +```bash +# [09:07:32 AM] [foremans@x1921c0s0b0n0] ~/q/llm.devkit/Megatron-DeepSpeed  main !1 ?27 q4-drop 26s ✘ INT +$ PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./convergence_debug_small.txt bash train_llama_alcf.sh +source-ing /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/helpers.sh +Sourcing /home/foremans/q4-drop_sunspot/llm.devkit/setenv.sh... + UMD: agama-ci-devel-736.9 successfully loaded: + UMD: graphics-compute-runtime/agama-ci-devel-736.9 +Lmod has detected the following error: The following module(s) are unknown: "gcc/12.1.0" + +Please check the spelling or version number. Also try "module spider ..." +It is also possible your cache file is out-of-date; it may help to try: + $ module --ignore_cache load "gcc/12.1.0" + +Also make sure that all modulefiles written in TCL start with the string #%Module + +Note: the module "intel_compute_runtime/release/agama-devel-647" cannot be unloaded because it was not loaded. + +Running on SunSpot !! +[python] Using: /home/foremans/miniconda3/envs/q4-drop/bin/python3 +Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env +Found ezpz! +/lus/gila/projects/Aurora_deployment/foremans/locations/sunspot/projects/saforem2/ezpz/src/ezpz/__init__.py +Has ezpz installed. Nothing to do. +Done with ezpz. 
+┌─────────────────────────────────────────────────────────────────── +│ Writing PBS vars to /home/foremans/.pbsenv +│ HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 +│ NHOSTS: 2 +│ NGPU_PER_HOST: 12 GPUs per host +│ NGPUS: 24 GPUs total +└─────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Hosts]: +│ • [host:0] - x1921c0s0b0n0.hostmgmt2000.cm.americas.sgi.com +│ • [host:1] - x1921c0s1b0n0.hostmgmt2000.cm.americas.sgi.com +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [DIST INFO]: +│ • Loading job env from: /home/foremans/.pbsenv +│ • HOSTFILE: /var/spool/pbs/aux/8988430.amn-0001 +│ • NHOSTS: 2 +│ • NGPU_PER_HOST: 12 +│ • NGPUS (NHOSTS x NGPU_PER_HOST): 24 +│ • WORLD_SIZE: 24 +│ • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001 +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Launch]: +│ • Use: 'launch' (=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/8988430.amn-0001) +│ to launch job +└────────────────────────────────────────────────────────────────── +DS_CONFIG: ds_stage2_mb4_gb96_pp1_bf16.json +ZS: 2, CPU_OPTIMIZER: , MB: 4, GB: 96, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0 +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! 
+Calling: `setData()` with ./convergence_debug_small.txt +-------------------- +Updated environment: +DATA_FILE_LIST: ./convergence_debug_small.txt +NUM_DOCS: 15 + WEIGHT_SUM: 15.0 +DFL_STEM: convergence_debug_small +DATA_CACHE_PATH: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache +-------------------- +++++++++++++++++++++++++++++++++++++++++++++++++++ +- MPICH_DIR= +- Using /home/foremans/miniconda3/envs/q4-drop/bin/python3 +- WORLD_SIZE:24 +- NCCL: nccl +- MODEL_TYPE: llama-seq4096-pp1-tp1-32layers-32heads-4096hidden +- Using DATA_FILE_LIST: ./convergence_debug_small.txt +++++++++++++++++++++++++++++++++++++++++++++++++++ +! Using /home/foremans/miniconda3/envs/q4-drop/bin/deepspeed +/home/foremans/miniconda3/envs/q4-drop/bin/ds_report:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html + __import__('pkg_resources').require('deepspeed==0.12.3+6ea44d02') +/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you don't plan on using image function +ality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torch +vision` from source? + warn( +[2024-04-04 09:07:45,585] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) +[2024-04-04 09:07:45,818] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to xpu (auto detect) +-------------------------------------------------- +DeepSpeed C++/CUDA extension op report +-------------------------------------------------- +NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. 
Op compatibility means that your system + meet the required dependencies to JIT install the op. +-------------------------------------------------- +JIT compiled ops requires ninja +ninja .................. [OKAY] +-------------------------------------------------- +op name ................ installed .. compatible +-------------------------------------------------- +async_io ............... [NO] ....... [OKAY] +cpu_adagrad ............ [NO] ....... [OKAY] +cpu_adam ............... [NO] ....... [OKAY] +flash_attn ............. [NO] ....... [OKAY] +fused_adam ............. [NO] ....... [OKAY] +quantizer .............. [NO] ....... [OKAY] +transformer ............ [NO] ....... [OKAY] +transformer_inference .. [NO] ....... [OKAY] +utils .................. [NO] ....... [OKAY] +-------------------------------------------------- +DeepSpeed general environment info: +torch install path ............... ['/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch'] +torch version .................... 2.1.0a0+cxx11.abi +deepspeed install path ........... ['/lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/DeepSpeed/deepspeed'] +deepspeed info ................... 0.12.3+6ea44d02, 6ea44d02, HEAD +deepspeed wheel compiled w. ...... torch 2.1 +shared memory (/dev/shm) size .... 
503.18 GB + + deepspeed --hostfile /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/gila/projects/Aurora_deployment/ +foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion --lr-decay +-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce-grads-in-fp32 + --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/tensorboard --log-timers-to-tensorboard --log-optimizer +-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16 + --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend ccl --num-attention-heads 32 --save-interval 20 +0 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 4 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 1 --global-bat +ch-size 96 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ +.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/ALCF/tokenizer.model --no-query- +key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --deepspeed-activation-checkpointing --z +ero-stage=2 --deepspeed_config=ds_stage2_mb4_gb96_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint-num-layers 1 |& tee logs/ds_stage2 
+_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + +[!! NOTE] View output at: +logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0/output.log + +# ... + +/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.010017 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_doc_idx.npy + > loading sample-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_sample_idx.npy + > loading shuffle-idx mapping from /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/bf90c74a625ac2ee4de6e1d6f7f84fbb_shuffle_idx.npy + loaded indexed file in 0.056 seconds + total number of samples: 2318461 + total number of epochs: 8 +> loading blendable dataset index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_index.npy +> loading blendable dataset sample index: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/llm.devkit/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/3a426af74008c22f9db24db811aad6b7_sample_index.npy 
+/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 2 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary. + +[after dataloaders are built] datetime: 2024-04-04 09:09:27 +done with setup ... +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (64818.18, 64858.22) + train/valid/test-data-iterators-setup ..........: (1968.10, 2288.56) +training ... +[before the start of training step] datetime: 2024-04-04 09:09:27 +[2024-04-04 09:09:27,718] [INFO] [checkpointing.py:540:forward] Activation Checkpointing Information +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:541:forward] ----Partition Activations False, CPU CHECKPOINTING False +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:542:forward] ----contiguous Memory Checkpointing False with 32 total layers +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:544:forward] ----Synchronization False +[2024-04-04 09:09:27,719] [INFO] [checkpointing.py:545:forward] ----Profiling time in checkpointing False +[2024-04-04 09:09:33][INFO][utils:145] - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable. +[2024-04-04 09:09:33][INFO][utils:148] - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. +[2024-04-04 09:09:33][INFO][utils:160] - NumExpr defaulting to 8 threads. 
+^[c[2024-04-04 09:09:53,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 884.11 | optimizer_gradients: 6.43 | optimizer_step: 23.44 +[2024-04-04 09:09:53,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 6567.68 | bwd_microstep: 17950.36 | bwd_inner_microstep: 17711.20 | bwd_allreduce_microstep: 239.11 | step_microstep: 1139.27 +[2024-04-04 09:09:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 6567.66 | bwd: 17950.35 | bwd_inner: 17711.19 | bwd_allreduce: 239.11 | step: 1139.29 +[Rank 0] (after 1 iterations) memory (MB) | allocated: 18244.640625 | max allocated: 41299.50146484375 | reserved: 46764.0 | max reserved: 46764.0 + iteration 1/ 317892 | consumed samples: 96 | consumed tokens: 393216 | elapsed time per iteration (ms): 25849.1 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 1.117136E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.714 | tokens per gpu per second(tgs): 633.832 | TFLOPs: 38.61 | +[2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 327.85 | optimizer_gradients: 6.26 | optimizer_step: 23.60 +[2024-04-04 09:10:13,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 4022.74 | bwd_microstep: 15738.67 | bwd_inner_microstep: 15556.80 | bwd_allreduce_microstep: 181.82 | step_microstep: 371.01 +[2024-04-04 09:10:13,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4022.73 | bwd: 15738.66 | bwd_inner: 15556.62 | bwd_allreduce: 181.81 | step: 371.02 + iteration 2/ 
317892 | consumed samples: 192 | consumed tokens: 786432 | elapsed time per iteration (ms): 20298.3 | learning rate: 3.000E-04 | global batch size: 96 | lm loss: 2.537718E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 4.729 | tokens per gpu per second(tgs): 807.159 | TFLOPs: 49.17 | +``` + +
+ +
[Polaris]: + +```bash +[09:31:35 AM] [foremans@x3112c0s13b0n0] ~/pol/p/a/Megatron-DeepSpeed  main !4 ?24 cu118-pt221 ✘ INT +$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./convergence_debug_small.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh +source-ing /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/helpers.sh +Running on Polaris !! + +[python] Using: /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 +Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env +Found ezpz! +/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/ezpz/src/ezpz/__init__.py +Has ezpz installed. Nothing to do. +Done with ezpz. +┌─────────────────────────────────────────────────────────────────── +│ Writing PBS vars to /home/foremans/.pbsenv +│ HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +│ NHOSTS: 2 +│ NGPU_PER_HOST: 4 GPUs per host +│ NGPUS: 8 GPUs total +└─────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Hosts]: +│ • [host:0] - x3112c0s13b0n0.hsn.cm.polaris.alcf.anl.gov +│ • [host:1] - x3112c0s13b1n0.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [DIST INFO]: +│ • Loading job env from: /home/foremans/.pbsenv +│ • HOSTFILE: /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +│ • NHOSTS: 2 +│ • NGPU_PER_HOST: 4 +│ • NGPUS (NHOSTS x NGPU_PER_HOST): 8 +│ • WORLD_SIZE: 8 +│ • DIST_LAUNCH: mpiexec --verbose --envall -n 8 -ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov +└────────────────────────────────────────────────────────────────── +┌────────────────────────────────────────────────────────────────── +│ [Launch]: +│ • Use: 'launch' (=mpiexec --verbose --envall -n 8 
-ppn 4 --hostfile /var/spool/pbs/aux/1822297.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov) +│ to launch job +└────────────────────────────────────────────────────────────────── +DS_CONFIG: ds_stage2_mb8_gb32_pp1_bf16.json +ZS: 2, CPU_OPTIMIZER: , MB: 8, GB: 32, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0 +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +!! Caught USE_ACTIVATION_CHECKPOINTING=1 !! +Calling: setData() with ./convergence_debug_small.txt +-------------------- +Updated environment: +DATA_FILE_LIST: ./convergence_debug_small.txt +NUM_DOCS: 15 + WEIGHT_SUM: 15.0 +DFL_STEM: convergence_debug_small +DATA_CACHE_PATH: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache +-------------------- +++++++++++++++++++++++++++++++++++++++++++++++++++ +- MPICH_DIR=/opt/cray/pe/mpich/8.1.25/ofi/gnu/9.1 +- Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/python3 +- WORLD_SIZE:8 +- NCCL: nccl +- MODEL_TYPE: llama-seq4096-pp1-tp2-32layers-32heads-4096hidden +- Using DATA_FILE_LIST: ./convergence_debug_small.txt +++++++++++++++++++++++++++++++++++++++++++++++++++ +! Using /eagle/datascience/foremans/miniconda3/envs/cu118-pt221/bin/deepspeed +[2024-04-04 09:35:35,959] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +-------------------------------------------------- +DeepSpeed C++/CUDA extension op report +-------------------------------------------------- +NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. +-------------------------------------------------- +JIT compiled ops requires ninja +ninja .................. [OKAY] +-------------------------------------------------- +op name ................ installed .. 
compatible +-------------------------------------------------- +async_io ............... [NO] ....... [OKAY] +fused_adam ............. [NO] ....... [OKAY] +cpu_adam ............... [NO] ....... [OKAY] +cpu_adagrad ............ [NO] ....... [OKAY] +cpu_lion ............... [NO] ....... [OKAY] + [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH +evoformer_attn ......... [NO] ....... [NO] +fused_lamb ............. [NO] ....... [OKAY] +fused_lion ............. [NO] ....... [OKAY] +inference_core_ops ..... [NO] ....... [OKAY] +cutlass_ops ............ [NO] ....... [OKAY] +transformer_inference .. [NO] ....... [OKAY] +quantizer .............. [NO] ....... [OKAY] +ragged_device_ops ...... [NO] ....... [OKAY] +ragged_ops ............. [NO] ....... [OKAY] +random_ltd ............. [NO] ....... [OKAY] + [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.2 + [WARNING] using untested triton version (2.2.0), only 1.0.0 is known to be compatible +sparse_attn ............ [NO] ....... [NO] +spatial_inference ...... [NO] ....... [OKAY] +transformer ............ [NO] ....... [OKAY] +stochastic_transformer . [NO] ....... [OKAY] +-------------------------------------------------- +DeepSpeed general environment info: +torch install path ............... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/torch'] +torch version .................... 2.2.1 +deepspeed install path ........... ['/eagle/datascience/foremans/miniconda3/envs/cu118-pt221/lib/python3.12/site-packages/deepspeed'] +deepspeed info ................... 0.14.0, unknown, unknown +torch cuda version ............... 11.8 +torch hip version ................ None +nvcc version ..................... 11.8 +deepspeed wheel compiled w. ...... torch 2.2, cuda 11.8 +shared memory (/dev/shm) size .... 
251.61 GB + + deepspeed --hostfile /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/hostfile_deepspeed --launcher MPICH /lus/eagle/projects/datascienc +e/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/pretrain_gpt_alcf.py --bf16 --optimizer adamw --split 100,0,0 --log-interval 1 --no-bias-gelu-fusion + --lr-decay-style cosine --no-bias-dropout-fusion --no-masked-softmax-fusion --tokenizer-type Llama2Tokenizer --no-gradient-accumulation-fusion --accumulate-allreduce- +grads-in-fp32 --use-checkpoint-opt_param-scheduler --tensorboard-dir checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/tensorboard --log-timers-to-tensorboard - +-log-optimizer-states-to-tensorboard --lr 0.0003 --save checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16 --load checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_ +pp1_tp2_bf16 --seq-length 4096 --num-layers 32 --hidden-size 4096 --train-iters 317892 --eval-iters 10 --distributed-backend nccl --num-attention-heads 32 --s +ave-interval 200 --eval-interval 50000 --max-position-embeddings 4096 --micro-batch-size 8 --data-file-list ./convergence_debug_small.txt --tensor-model-parallel-size 2 + --global-batch-size 32 --pipeline-model-parallel-size 1 --num-key-value-heads 8 --data-cache-path /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-l +cf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache --ffn-hidden-size 11008 --tokenizer-model /home/foremans/polaris/projects/argonne-lcf/Megatron-DeepSpeed/ALCF/tokeniz +er.model --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear --use-flash-attn-v2 + --deepspeed-activation-checkpointing --zero-stage=2 --deepspeed_config=ds_stage2_mb8_gb32_pp1_bf16.json --no-pipeline-parallel --deepspeed --checkpoint-activations --checkpoint- +num-layers 1 |& tee 
logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + +[!! NOTE] View output at: +logs/ds_stage2_nl32_hs4096_mb8_seq4096_gb32_pp1_tp2_bf16/0404093534_x3112c0s13b0n0/output.log + +# ... + +/eagle/datasets/dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document.bin + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.001280 seconds + number of documents: 1498927 + > dataset split: + train: + document indices in [0, 1498927) total of 1498927 documents + validation: + document indices in [1498927, 1498927) total of 0 documents + test: + document indices in [1498927, 1498927) total of 0 documents + > loading doc-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_doc_idx.npy + > loading sample-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_sample_idx.npy + > loading shuffle-idx mapping from /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/9217d94f3290abc2fddf9e87bff236d6_shuffle_idx.npy + loaded indexed file in 0.004 seconds + total number of samples: 869423 + total number of epochs: 3 +> loading blendable dataset index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_index.npy +> loading blendable dataset sample index: /lus/eagle/projects/datascience/foremans/locations/polaris/projects/argonne-lcf/Megatron-DeepSpeed/.cache/convergence_debug_small/index-cache/a815d51f6752c6f486d94194ce95fb87_sample_index.npy +> size of blendable dataset: 10223415 samples +> finished 
creating GPT datasets ... +[after dataloaders are built] datetime: 2024-04-04 09:36:07 +done with setup ... +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (4794.78, 4795.23) + train/valid/test-data-iterators-setup ..........: (589.69, 721.20) +training ... +[before the start of training step] datetime: 2024-04-04 09:36:07 +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:539:forward] Activation Checkpointing Information +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:540:forward] ----Partition Activations False, CPU CHECKPOINTING False +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:541:forward] ----contiguous Memory Checkpointing False with 32 total layers +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:543:forward] ----Synchronization False +[2024-04-04 09:36:07,407] [INFO] [checkpointing.py:544:forward] ----Profiling time in checkpointing False +[2024-04-04 09:36:28,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1626.54 | optimizer_gradients: 19.29 | optimizer_step: 419.48 +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[0.00029999999999267505, 0.00029999999999267505], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 11336.34 | bwd_microstep: 7134.73 | bwd_inner_microstep: 7090.02 | bwd_allreduce_microstep: 44.65 | step_microstep: 2564.02 +[2024-04-04 09:36:28,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 11336.33 | bwd: 7134.75 | bwd_inner: 7090.01 | bwd_allreduce: 44.66 | step: 2564.02 + iteration 1/ 317892 | consumed samples: 32 | consumed tokens: 131072 | elapsed time per iteration (ms): 21133.8 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.119983E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1.514 | tokens per gpu per second(tgs): 
775.250 | TFLOPs: 47.23 | +[Rank 1] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 35824.0 +[Rank 0] (after 1 iterations) memory (MB) | allocated: 14165.525390625 | max allocated: 22332.37255859375 | reserved: 24642.0 | max reserved: 32994.0 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 1605.55 | optimizer_gradients: 11.56 | optimizer_step: 50.92 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[0.00029999999997070033, 0.00029999999997070033], mom=[(0.9, 0.999), (0.9, 0.999)] +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.17 | bwd_microstep: 6832.48 | bwd_inner_microstep: 6789.73 | bwd_allreduce_microstep: 42.70 | step_microstep: 1867.64 +[2024-04-04 09:36:38,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 1395.15 | bwd: 6832.49 | bwd_inner: 6789.73 | bwd_allreduce: 42.71 | step: 1867.65 + iteration 2/ 317892 | consumed samples: 64 | consumed tokens: 262144 | elapsed time per iteration (ms): 10154.3 | learning rate: 3.000E-04 | global batch size: 32 | lm loss: 1.766422E+01 | loss scale: 1.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 3.151 | tokens per gpu per second(tgs): 1613.503 | TFLOPs: 98.29 | + +# ... +``` + +
+ +
+ +## 📦 Install + +
Install Instructions + +1. Clone [`argonne-lcf/Megatron-DeepSpeed`](https://github.com/argonne-lcf/Megatron-DeepSpeed) + + ```bash + $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed + $ cd Megatron-DeepSpeed + ``` + + > [!NOTE] + > In the `conda create` command below, + > you can replace `--name "${DAY}"` with + > `--prefix /path/to/your/conda/envs`, if you prefer: + +2. Create `conda` env: + + ```bash + $ module load conda/2023-10-04 + $ export MPICC="cc -shared -target-accel=nvidia80" + $ export DAY=$(date "+%Y-%m-%d") + $ export PYTHONUSERBASE="${HOME}/.local/polaris/conda/${DAY}" + $ conda create --solver libmamba -c pytorch -c nvidia --name "${DAY}" "python==3.12" + ``` + +3. Install dependencies: + + ```bash + $ conda activate "${DAY}" # e.g. 2024-03-07 + $ conda install -c pytorch -c nvidia --solver libmamba mpi4py ninja transformers xformers triton pytorch torchvision torchaudio pytorch-cuda=11.8 + $ conda install --solver libmamba mpi4py -c conda-forge -c pytorch -c nvidia + $ python3 -m pip install --upgrade pip pybind11 toolong appdirs wandb sentencepiece ipython setuptools wheel ninja + $ python3 -m pip install --upgrade deepspeed wandb + ``` + + - [`ezpz`](https://github.com/saforem2/ezpz): + +
install: + + ```bash + $ git clone https://github.com/saforem2/ezpz + $ python3 -m pip install -e "ezpz[dev]" + ``` + +
+ + - [**OPTIONAL**] [`NVIDIA/apex`](https://github.com/NVIDIA/apex): + +
install: + + ```bash + $ git clone https://github.com/NVIDIA/apex + $ cd apex + # NOTE: need GCC < 11 for APEX ¯\_(ツ)_/¯ ?? + $ module swap gcc gcc/10.3.0 + $ python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + ``` + +
+ +
+ + +## 📝 Data Preprocessing + +
Data Pre-Processing: + AuroraGPT is trained on the Dolma dataset (initially v0), now in the process of moving to v6. For more details on the dataset, refer to https://huggingface.co/datasets/allenai/dolma. The downloaded Dolma dataset has already been preprocessed to remove duplicates (dedup) and filter the data (mixing). For more details refer to https://github.com/allenai/dolma/tree/main/docs and https://github.com/vksastry/dolma_alcf/blob/main/ALCF/Readme.md. The data preprocessing of Dolma dataset before training consists of tokenization of the data using a specific tokenizer (LlamaTokenizer is what we are currently using). Use the script below to tokenize the entire dataset. Example shown for Polaris. @@ -207,3 +664,5 @@ The data preprocessing of Dolma dataset before training consists of tokenization cd /eagle/datasets/dolma/utils ./tokenization.sh ``` + +
From 78f378539eb4932a82dced1cfb8262ee608eca28 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 11:45:49 -0500 Subject: [PATCH 185/268] Update README.md --- ALCF/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 5f0fd1b6bd..d405986fdf 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -152,7 +152,7 @@ DS_CONFIG: ds_stage2_mb4_gb96_pp1_bf16.json ZS: 2, CPU_OPTIMIZER: , MB: 4, GB: 96, PP: 1, DTYPE: bf16!!!Please see logs at logs/ds_stage2_nl32_hs4096_mb4_seq4096_gb96_pp1_tp1_bf16/0404090742_x1921c0s0b0n0 !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! !! Caught USE_ACTIVATION_CHECKPOINTING=1 !! -Calling: `setData()` with ./convergence_debug_small.txt +Calling: setData() with ./convergence_debug_small.txt -------------------- Updated environment: DATA_FILE_LIST: ./convergence_debug_small.txt @@ -172,7 +172,7 @@ DATA_CACHE_PATH: /lus/gila/projects/Aurora_deployment/foremans/q4-drop_sunspot/l ! Using /home/foremans/miniconda3/envs/q4-drop/bin/deepspeed /home/foremans/miniconda3/envs/q4-drop/bin/ds_report:4: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html __import__('pkg_resources').require('deepspeed==0.12.3+6ea44d02') -/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you don't plan on using image function +/home/foremans/miniconda3/envs/q4-drop/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: ''If you dont plan on using image function ality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torch vision` from source? 
warn( From 2dc5aebb1eb4345c1eab32d8247a813c7c6bba98 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 18 Apr 2024 12:05:35 -0500 Subject: [PATCH 186/268] Add `ALCF/data-lists/polaris/*.txt` --- ALCF/data-lists/polaris/data_file_list_c4.txt | 86 + ALCF/data-lists/polaris/data_file_list_cc.txt | 2878 +++++++++++ .../data_file_list_gutenberg-books.txt | 3 + .../polaris/data_file_list_peS2o.txt | 42 + .../polaris/data_file_list_stack-code.txt | 4435 +++++++++++++++++ .../polaris/data_file_list_wiki-en-simple.txt | 2 + 6 files changed, 7446 insertions(+) create mode 100644 ALCF/data-lists/polaris/data_file_list_c4.txt create mode 100644 ALCF/data-lists/polaris/data_file_list_cc.txt create mode 100644 ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt create mode 100644 ALCF/data-lists/polaris/data_file_list_peS2o.txt create mode 100644 ALCF/data-lists/polaris/data_file_list_stack-code.txt create mode 100644 ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt diff --git a/ALCF/data-lists/polaris/data_file_list_c4.txt b/ALCF/data-lists/polaris/data_file_list_c4.txt new file mode 100644 index 0000000000..9ff6f90ff9 --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_c4.txt @@ -0,0 +1,86 @@ +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0012_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0001_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0073_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0045_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0084_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0065_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0032_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0085_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0064_text_document +0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0025_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0042_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0055_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0023_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0028_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0036_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0037_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0016_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0000_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0019_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0046_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0059_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0017_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0072_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0033_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0006_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0061_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0071_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0057_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0011_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0047_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0004_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0009_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0070_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0018_text_document +0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0054_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0049_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0003_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0021_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0051_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0074_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0027_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0050_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0079_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0022_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0030_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0034_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0020_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0035_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0015_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0066_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0044_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0010_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0002_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0041_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0067_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0048_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0013_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0083_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0053_text_document +0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0008_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0014_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0069_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0056_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0062_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0031_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0007_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0077_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0058_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0076_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0078_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0005_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0081_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0040_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0068_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0075_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0063_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0029_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0039_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0026_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0052_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0024_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0043_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0060_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0082_text_document +0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0080_text_document +0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0038_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_cc.txt b/ALCF/data-lists/polaris/data_file_list_cc.txt new file mode 100644 index 0000000000..4a20a99b89 --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_cc.txt @@ -0,0 +1,2878 @@ +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0553_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0299_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0366_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0753_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0429_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0372_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0124_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0437_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0053_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0615_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0182_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0713_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0688_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0166_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0768_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0692_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0041_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0416_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0630_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0639_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0225_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0035_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0365_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0368_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0196_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0328_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0624_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0081_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0488_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0189_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0118_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0150_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0314_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0209_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0229_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0265_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0532_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0478_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0140_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0256_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0047_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0607_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0023_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0111_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0613_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0748_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0000_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0127_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0106_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0563_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0577_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0502_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0705_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0538_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0088_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0263_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0460_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0571_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0653_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0172_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0524_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0652_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0322_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0447_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0387_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0612_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0290_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0339_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0487_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0396_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0178_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0091_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0193_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0408_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0496_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0755_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0773_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0547_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0384_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0574_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0533_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0464_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0489_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0050_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0060_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0114_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0033_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0561_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0208_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0233_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0744_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0326_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0313_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0482_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0436_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0588_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0080_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0660_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0038_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0282_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0745_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0406_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0116_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0059_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0503_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0357_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0171_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0770_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0286_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0544_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0698_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0155_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0341_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0463_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0676_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0595_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0174_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0198_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0480_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0687_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0145_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0004_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0583_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0449_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0204_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0715_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0521_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0320_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0568_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0151_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0197_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0709_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0499_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0006_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0269_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0525_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0413_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0656_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0646_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0246_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0535_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0333_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0238_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0241_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0469_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0689_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0403_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0404_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0360_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0191_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0236_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0032_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0445_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0614_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0490_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0651_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0703_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0702_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0623_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0719_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0728_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0031_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0253_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0551_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0327_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0027_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0491_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0395_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0473_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0662_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0312_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0605_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0455_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0580_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0005_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0311_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0305_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0260_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0566_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0670_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0129_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0742_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0549_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0058_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0501_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0071_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0450_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0375_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0131_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0697_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0415_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0560_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0643_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0699_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0515_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0739_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0092_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0046_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0083_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0443_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0746_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0655_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0427_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0603_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0367_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0318_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0520_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0749_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0771_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0369_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0434_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0602_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0349_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0763_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0731_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0338_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0462_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0347_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0649_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0194_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0134_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0734_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0632_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0280_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0184_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0089_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0095_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0555_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0016_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0168_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0665_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0767_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0666_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0737_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0037_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0648_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0064_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0764_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0323_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0009_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0545_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0212_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0015_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0267_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0727_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0661_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0211_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0220_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0278_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0721_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0718_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0207_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0619_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0400_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0754_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0610_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0358_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0758_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0298_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0756_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0729_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0468_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0397_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0247_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0149_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0119_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0010_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0093_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0386_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0045_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0066_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0393_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0600_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0440_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0350_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0214_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0714_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0161_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0775_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0203_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0077_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0332_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0700_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0123_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0024_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0013_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0587_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0148_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0513_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0674_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0188_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0599_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0158_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0425_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0003_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0534_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0254_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0121_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0099_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0373_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0479_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0379_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0344_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0684_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0720_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0391_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0575_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0319_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0336_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0531_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0474_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0432_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0766_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0342_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0476_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0237_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0061_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0250_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0752_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0329_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0376_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0640_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0634_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0682_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0181_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0076_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0244_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0690_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0303_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0228_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0477_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0224_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0199_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0343_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0399_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0707_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0760_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0774_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0270_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0144_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0451_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0180_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0025_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0363_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0516_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0647_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0581_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0679_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0635_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0201_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0133_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0351_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0325_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0183_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0287_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0683_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0316_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0275_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0424_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0461_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0576_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0390_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0052_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0086_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0492_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0216_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0772_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0439_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0249_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0493_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0593_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0442_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0218_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0484_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0346_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0157_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0352_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0441_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0486_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0537_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0485_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0164_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0022_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0458_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0497_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0170_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0154_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0751_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0048_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0428_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0418_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0112_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0757_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0421_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0471_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0510_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0466_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0641_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0601_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0740_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0594_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0276_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0383_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0232_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0717_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0644_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0518_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0743_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0673_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0044_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0667_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0308_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0675_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0572_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0579_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0723_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0381_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0759_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0504_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0708_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0049_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0642_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0074_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0039_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0401_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0409_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0014_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0098_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0146_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0616_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0101_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0446_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0565_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0295_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0730_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0498_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0638_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0301_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0139_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0192_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0001_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0268_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0527_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0359_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0315_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0251_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0546_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0262_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0659_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0567_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0190_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0078_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0175_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0054_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0008_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0452_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0187_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0011_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0138_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0087_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0206_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0611_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0509_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0205_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0620_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0677_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0132_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0296_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0495_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0444_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0598_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0691_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0761_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0417_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0317_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0578_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0374_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0055_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0481_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0307_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0736_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0136_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0550_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0084_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0511_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0380_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0356_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0310_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0110_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0668_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0306_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0115_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0324_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0202_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0294_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0704_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0629_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0608_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0627_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0725_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0472_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0230_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0407_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0556_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0505_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0040_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0606_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0096_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0281_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0179_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0557_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0288_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0769_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0370_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0017_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0694_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0385_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0130_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0562_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0506_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0036_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0217_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0289_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0712_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0724_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0564_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0105_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0120_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0141_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0431_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0142_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0570_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0512_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0227_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0411_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0389_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0735_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0585_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0122_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0042_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0309_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0765_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0636_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0539_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0467_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0586_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0750_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0200_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0671_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0530_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0012_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0082_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0160_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0438_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0195_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0185_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0215_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0173_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0710_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0348_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0590_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0073_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0030_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0079_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0072_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0019_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0239_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0410_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0453_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0543_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0007_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0733_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0125_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0569_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0331_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0062_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0043_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0433_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0235_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0448_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0696_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0559_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0392_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0664_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0483_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0591_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0272_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0271_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0340_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0159_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0153_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0541_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0028_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0067_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0222_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0165_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0117_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0669_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0103_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0258_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0097_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0419_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0459_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0609_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0104_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0430_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0582_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0457_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0529_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0029_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0426_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0279_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0596_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0631_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0517_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0507_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0252_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0626_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0056_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0335_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0542_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0672_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0284_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0257_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0654_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0678_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0018_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0528_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0422_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0068_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0245_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0617_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0255_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0176_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0732_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0221_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0371_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0137_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0177_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0382_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0291_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0597_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0321_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0292_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0273_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0716_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0094_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0026_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0021_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0242_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0226_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0100_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0519_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0552_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0057_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0776_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0109_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0219_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0706_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0508_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0722_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0456_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0633_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0618_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0354_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0514_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0475_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0034_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0300_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0454_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0693_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0658_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0304_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0090_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0548_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0063_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0465_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0414_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0169_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0762_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0701_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0135_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0143_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0167_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0523_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0540_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0334_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0297_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0738_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0500_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0554_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0695_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0747_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0108_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0113_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0686_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0302_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0362_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0355_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0645_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0102_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0681_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0536_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0156_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0163_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0277_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0573_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0293_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0741_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0377_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0423_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0592_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0657_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0584_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0628_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0240_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0361_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0680_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0388_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0435_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0345_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0637_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0420_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0378_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0398_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0234_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0266_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0210_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0711_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0162_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0264_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0070_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0223_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0470_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0002_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0069_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0625_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0285_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0065_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0147_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0085_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0394_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0330_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0621_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0248_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0274_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0526_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0685_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0589_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0128_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0405_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0126_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0075_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0494_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0283_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0558_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0402_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0261_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0107_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0522_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0604_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0231_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0663_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0726_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0152_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0353_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0243_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0364_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0213_text_document +0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0412_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0622_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0650_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0354_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0200_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0391_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0559_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0473_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0528_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0073_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0170_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0180_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0582_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0067_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0036_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0023_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0099_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0342_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0353_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0044_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0575_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0107_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0566_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0101_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0542_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0217_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0437_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0583_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0172_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0060_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0072_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0393_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0414_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0355_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0082_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0281_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0051_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0133_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0471_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0346_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0034_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0300_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0282_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0309_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0125_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0434_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0460_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0186_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0504_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0487_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0132_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0225_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0401_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0477_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0607_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0562_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0497_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0174_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0569_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0591_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0110_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0455_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0166_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0453_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0581_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0341_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0544_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0606_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0291_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0086_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0578_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0083_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0357_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0188_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0411_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0218_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0003_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0001_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0543_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0307_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0169_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0149_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0561_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0310_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0222_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0547_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0183_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0214_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0111_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0127_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0054_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0007_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0572_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0103_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0334_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0114_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0513_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0237_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0397_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0306_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0120_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0129_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0262_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0459_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0085_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0206_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0271_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0610_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0031_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0390_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0043_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0012_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0409_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0574_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0596_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0588_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0532_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0236_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0501_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0269_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0540_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0175_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0290_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0233_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0440_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0071_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0037_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0420_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0404_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0141_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0592_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0164_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0162_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0388_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0159_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0372_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0476_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0323_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0008_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0151_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0533_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0344_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0481_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0204_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0179_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0496_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0469_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0055_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0367_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0277_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0603_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0512_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0340_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0143_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0140_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0285_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0124_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0531_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0375_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0013_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0522_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0066_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0332_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0228_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0445_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0430_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0018_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0392_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0505_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0485_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0130_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0026_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0489_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0006_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0157_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0467_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0454_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0600_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0163_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0248_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0339_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0534_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0038_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0597_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0303_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0425_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0000_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0352_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0226_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0461_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0545_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0102_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0587_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0048_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0336_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0502_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0427_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0090_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0369_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0216_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0292_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0243_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0326_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0602_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0611_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0499_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0032_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0599_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0097_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0182_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0378_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0509_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0139_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0456_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0322_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0221_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0153_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0076_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0057_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0284_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0075_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0422_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0288_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0077_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0305_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0273_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0242_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0050_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0525_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0232_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0173_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0294_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0016_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0365_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0604_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0138_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0178_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0377_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0042_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0065_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0112_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0142_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0184_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0495_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0276_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0301_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0210_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0494_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0447_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0333_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0424_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0224_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0105_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0081_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0579_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0536_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0168_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0293_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0021_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0230_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0260_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0450_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0465_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0394_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0319_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0028_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0608_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0538_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0155_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0517_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0240_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0515_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0158_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0209_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0321_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0296_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0576_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0080_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0091_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0259_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0121_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0580_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0595_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0062_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0436_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0337_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0059_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0115_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0412_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0462_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0106_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0423_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0366_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0215_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0263_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0295_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0443_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0557_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0010_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0136_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0376_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0235_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0135_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0144_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0548_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0537_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0128_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0287_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0194_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0069_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0324_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0364_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0187_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0096_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0558_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0063_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0551_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0286_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0449_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0255_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0358_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0383_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0283_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0470_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0403_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0349_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0524_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0426_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0486_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0519_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0380_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0387_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0122_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0518_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0554_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0027_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0418_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0104_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0039_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0268_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0201_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0094_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0347_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0416_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0514_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0231_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0330_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0523_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0570_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0421_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0009_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0478_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0417_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0482_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0213_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0568_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0312_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0550_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0219_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0093_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0530_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0035_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0356_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0406_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0498_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0297_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0266_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0428_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0074_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0244_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0431_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0410_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0134_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0246_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0408_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0563_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0360_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0468_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0402_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0370_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0148_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0092_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0089_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0251_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0061_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0609_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0539_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0475_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0261_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0304_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0203_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0084_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0395_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0131_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0197_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0087_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0327_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0279_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0196_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0555_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0041_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0317_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0193_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0441_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0202_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0511_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0220_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0474_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0577_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0014_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0171_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0361_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0264_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0024_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0432_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0451_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0552_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0584_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0239_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0458_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0407_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0045_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0541_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0371_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0480_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0590_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0015_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0108_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0320_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0145_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0483_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0521_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0419_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0150_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0526_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0589_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0119_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0315_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0546_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0510_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0373_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0413_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0249_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0484_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0493_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0491_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0385_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0005_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0374_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0185_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0345_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0571_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0167_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0234_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0318_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0520_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0256_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0116_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0088_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0556_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0302_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0238_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0205_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0019_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0191_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0199_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0078_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0594_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0195_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0030_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0439_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0448_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0350_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0267_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0275_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0348_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0560_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0181_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0329_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0516_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0363_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0258_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0359_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0299_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0457_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0379_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0049_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0368_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0265_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0046_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0311_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0177_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0058_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0040_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0549_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0605_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0160_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0472_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0020_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0553_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0211_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0052_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0466_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0382_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0351_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0433_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0270_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0593_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0529_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0095_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0308_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0152_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0064_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0189_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0527_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0070_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0400_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0257_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0229_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0154_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0362_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0573_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0161_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0252_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0386_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0280_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0585_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0464_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0338_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0278_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0506_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0033_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0137_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0444_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0247_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0109_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0004_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0011_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0508_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0126_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0017_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0254_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0567_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0100_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0398_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0117_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0147_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0176_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0156_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0490_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0022_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0190_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0047_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0207_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0446_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0227_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0435_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0289_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0146_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0598_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0503_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0165_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0208_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0025_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0442_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0325_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0429_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0056_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0212_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0002_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0245_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0068_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0381_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0452_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0500_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0396_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0328_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0384_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0389_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0586_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0488_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0298_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0463_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0118_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0479_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0274_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0272_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0492_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0113_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0415_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0405_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0198_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0313_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0331_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0314_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0053_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0507_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0438_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0029_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0399_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0564_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0601_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0241_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0343_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0223_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0316_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0123_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0535_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0250_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0335_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0253_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0079_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0565_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0192_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0098_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0247_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1166_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1192_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0818_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0166_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0529_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1356_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0858_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0823_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1339_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0627_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1370_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0341_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0185_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1259_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0981_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0515_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0545_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1167_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0077_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0665_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1195_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0085_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0566_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0230_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0311_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0055_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0810_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1270_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0966_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0517_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0843_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0348_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0797_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0736_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0943_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1054_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1105_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0556_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0849_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0492_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1121_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0817_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0874_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0579_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1250_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0146_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0589_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0169_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1084_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1041_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0526_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0551_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0193_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1402_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1106_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0782_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0659_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0588_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0990_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0833_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0845_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1098_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0402_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0878_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0930_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0046_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0440_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1293_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0393_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0049_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0305_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0868_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1126_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0531_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1087_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1442_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0997_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0366_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0165_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1078_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0957_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1002_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0269_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0460_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1397_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0250_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0951_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1246_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0876_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0302_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0564_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0584_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0622_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0694_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0335_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1189_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0215_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1390_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0204_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1038_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0713_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0567_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1130_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0221_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0538_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1232_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1265_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0628_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0090_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0968_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0248_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0885_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0977_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0749_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0527_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0985_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0934_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0993_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0088_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0674_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0171_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1160_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0640_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1419_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0488_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0704_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0887_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1055_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1258_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0924_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0390_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0612_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1276_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0744_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0399_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0053_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0025_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0371_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0161_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1444_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0051_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0367_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0036_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1398_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0434_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0965_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1226_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0093_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0702_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0949_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1343_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1480_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0503_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0979_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1039_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1034_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0725_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1185_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1288_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0742_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0242_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1071_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0975_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1211_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0435_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0841_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1303_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1380_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0558_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0522_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0181_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0098_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0318_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1101_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1183_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0054_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0962_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0072_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1283_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1269_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0379_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0109_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1257_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0683_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0932_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0724_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0259_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0752_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0748_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1456_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0038_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1311_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0224_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1181_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0292_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0006_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1077_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1376_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0082_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1107_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1305_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1263_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1484_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0470_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0232_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0312_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1478_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0176_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0825_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0587_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0726_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1069_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0478_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1470_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0751_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0034_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0557_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0087_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0554_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0680_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0095_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1193_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1290_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0956_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1433_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1409_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0546_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1137_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1042_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1361_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0983_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1099_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0675_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0000_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0540_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0593_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1182_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0984_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0463_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0406_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0963_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1112_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0572_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0894_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0901_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0539_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1462_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0504_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1196_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0916_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0840_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1059_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0895_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1439_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0521_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0227_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0157_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0134_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0091_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0080_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1405_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0856_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0355_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0904_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1256_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0886_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0703_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0870_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1307_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1463_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1278_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1176_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0999_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0352_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1216_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0506_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0942_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0164_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0071_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0362_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0285_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0819_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1447_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0794_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1111_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0452_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1008_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1064_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0786_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0275_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1004_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1020_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1198_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0258_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0179_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0473_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0706_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0829_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1348_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0489_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0721_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1418_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0657_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0182_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1268_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0677_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0211_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1199_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1161_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1159_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1060_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0203_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0634_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0214_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1292_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0168_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1342_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0105_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1346_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0145_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0174_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1393_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1295_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0888_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0673_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1412_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1337_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1089_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1031_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0542_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0135_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0604_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0905_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1028_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0760_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0052_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0361_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0408_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0514_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1144_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0086_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0016_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0363_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0026_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0013_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0048_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0343_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0991_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1151_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0325_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0209_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1325_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0042_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0162_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0902_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0625_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0482_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0502_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0225_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1190_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0498_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0198_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1146_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1197_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0889_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0873_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1013_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1212_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0107_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0005_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0528_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0101_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0792_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0638_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1253_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1449_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0761_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1330_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0268_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0372_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0757_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0031_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0815_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0074_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0771_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0936_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1355_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0764_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0516_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1460_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0394_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0178_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1005_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1103_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0899_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1204_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1061_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0327_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0233_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0057_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0922_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0996_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1457_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0513_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1323_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0426_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1428_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0879_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0297_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1395_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0333_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0202_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1171_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0746_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0111_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1459_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0806_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1332_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0003_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1082_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0745_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1187_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0803_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0326_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1131_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0678_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1458_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1465_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0662_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0715_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0043_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1029_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0108_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0450_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0465_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0125_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1052_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0988_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0487_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0872_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0096_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0865_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0643_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0507_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0032_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1228_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1076_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1229_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0219_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1341_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0384_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0583_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0236_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1267_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0682_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1324_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0672_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1237_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0256_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0689_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0801_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1392_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0630_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0041_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1027_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0173_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1476_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1455_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0172_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0228_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0812_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0980_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0781_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1280_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0141_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1046_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0656_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0137_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0234_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0909_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0410_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0836_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1313_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1326_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1113_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1040_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1345_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1492_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0831_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0293_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1063_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0309_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0353_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0597_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0011_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0189_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1174_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0474_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1079_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0066_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0697_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0734_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0756_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1488_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0238_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0235_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0274_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0417_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0316_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1451_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0864_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0127_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0900_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0244_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0291_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1205_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1344_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1351_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0441_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0213_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1143_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1320_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0437_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0927_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0324_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1125_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1421_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0735_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0573_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0002_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1416_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0776_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1430_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1322_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0180_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0272_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0896_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0543_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0570_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0859_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0255_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0263_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1299_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0835_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0039_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0611_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0369_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0732_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1485_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0045_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1138_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0067_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0642_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0961_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0249_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1221_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0496_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1312_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0599_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0497_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1413_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1382_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0661_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1251_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0560_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0350_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0018_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0354_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0007_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0594_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1003_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1375_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0298_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0199_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0555_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0419_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0400_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0608_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0789_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0618_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0212_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0493_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0633_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0920_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0621_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0129_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1033_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0982_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0300_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1000_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1284_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0241_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0163_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0971_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0455_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1124_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1464_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0332_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1225_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0160_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0132_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0953_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1396_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0319_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0014_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0867_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0765_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0731_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0121_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1340_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1414_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1149_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0897_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1140_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1406_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1491_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0811_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0047_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0907_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0184_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1202_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0595_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1333_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0149_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1234_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1289_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0331_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1415_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0769_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0935_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0216_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0940_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0762_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0445_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0378_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0280_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0376_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1047_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1045_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1056_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0356_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0210_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1386_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1110_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0476_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1163_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0598_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0511_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0279_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1254_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0115_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0365_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1298_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0839_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1227_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1282_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0030_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0254_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0658_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0978_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0851_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0130_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0357_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0152_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0952_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0834_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1436_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1302_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1210_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1445_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1328_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0188_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1152_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0340_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0534_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0986_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0892_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0062_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1173_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0009_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0537_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0058_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0286_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1350_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0837_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0068_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1291_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1108_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0158_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0425_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1007_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0717_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1314_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1097_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0131_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1425_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0050_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1432_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0257_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1487_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0793_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0655_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0339_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1109_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0151_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0830_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0912_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0700_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0959_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1156_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0798_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1371_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0023_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1483_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1357_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0122_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0462_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1021_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0562_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0505_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0787_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1420_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1399_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1024_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1296_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0454_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0844_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0133_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1377_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1037_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0705_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0623_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0950_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0547_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0240_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1011_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0893_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0117_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1178_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1120_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0346_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1354_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0880_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1019_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0477_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1231_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0660_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0471_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1168_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1214_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1378_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1142_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1102_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1015_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1438_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1372_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0945_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1248_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1119_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0064_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0973_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0509_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1135_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0019_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0663_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1025_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0261_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1431_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1403_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0004_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1374_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1164_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0740_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1217_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0387_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0911_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1203_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0995_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0535_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1319_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0795_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1184_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0915_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0576_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0329_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0929_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0142_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0413_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0520_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1080_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1334_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1207_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0869_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1012_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0475_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0167_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0494_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0264_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1358_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0641_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1318_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1194_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0059_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0578_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1095_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0692_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1422_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0195_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0431_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0928_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0388_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0690_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0467_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0415_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0119_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0187_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0424_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0716_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0854_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0252_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0635_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0755_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0294_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0719_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0785_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0914_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1364_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0021_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1242_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0220_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0139_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1220_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1383_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0102_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1424_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0921_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1215_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0722_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0908_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0805_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0414_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0190_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0089_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1150_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1360_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0453_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0827_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1273_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0670_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0891_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1329_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1261_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1147_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1262_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0359_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0328_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1240_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1391_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0156_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0989_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1219_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0923_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0654_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0698_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0065_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1287_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0383_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0532_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0260_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1349_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1086_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0788_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0113_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1363_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0687_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1083_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0918_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0585_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0392_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0253_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1043_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0449_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0568_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0421_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0747_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1175_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1373_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1304_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1474_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1482_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1385_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0317_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0603_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0519_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1090_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0020_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1489_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0871_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0510_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0104_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1018_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0245_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1440_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0758_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1247_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1010_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1179_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0552_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0276_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0679_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0571_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1017_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0301_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1477_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1450_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0712_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0676_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0577_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0684_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0881_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0644_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0076_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1446_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0154_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0284_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0824_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0231_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0701_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0790_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1224_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1454_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0405_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0177_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0267_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0944_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1368_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0389_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0610_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0128_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1085_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1057_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0459_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0027_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0759_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0458_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0472_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0816_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0022_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0832_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0097_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0407_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0820_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0777_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0941_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0821_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0796_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1022_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0967_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0548_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1117_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0197_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0175_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0218_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0397_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0850_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0710_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0592_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0447_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0607_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1249_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0436_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0784_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0728_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0646_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1104_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0443_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1154_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0194_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1206_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1233_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0170_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0299_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0605_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0033_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0037_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0631_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0877_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0565_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0626_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0723_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0650_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0590_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0938_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1132_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1479_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0313_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0651_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0926_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1274_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0828_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0602_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0637_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0423_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0681_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0411_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0223_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1186_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0591_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1441_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0533_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1366_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1466_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1423_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1075_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0939_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1327_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0647_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0420_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0330_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1016_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0600_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0606_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1096_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0246_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0530_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0344_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1471_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0774_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0733_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0283_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0561_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0466_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1032_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1188_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1285_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0029_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1081_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0063_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0739_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0954_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0448_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1331_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1218_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0688_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1169_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0799_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0866_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0853_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0955_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0910_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0490_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0861_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0948_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1336_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0349_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0808_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1277_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0601_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0044_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0315_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1230_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1255_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0271_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1051_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0481_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0970_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0412_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0040_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1068_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1091_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1394_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0862_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1461_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1453_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0070_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0852_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0012_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0484_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0140_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0919_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0288_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1490_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0863_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1129_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0737_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0791_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0884_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0708_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0667_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1452_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1048_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0609_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1049_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0336_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0800_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0196_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0931_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1072_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0207_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0144_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0395_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0648_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1389_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0582_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0433_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0553_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1088_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0391_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0304_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0110_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1310_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0398_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0446_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0574_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0772_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0686_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1335_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1417_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1388_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0855_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0457_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1093_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0903_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0669_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0860_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0754_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0289_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0946_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1275_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0024_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0310_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1153_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0958_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0385_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1115_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0550_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0653_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1243_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1030_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0775_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0382_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1369_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1429_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1400_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1213_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0709_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0842_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0691_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1066_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0766_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1139_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0499_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0699_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0200_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0239_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0439_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1155_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0525_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0523_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0079_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0416_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1481_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0727_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0307_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0138_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1427_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1475_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0429_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0206_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0375_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1448_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1352_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0270_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0103_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0778_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1408_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1223_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1472_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0813_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1384_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0629_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0337_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0848_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0124_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0753_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1410_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0243_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0714_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1001_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1317_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0112_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1260_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0251_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1272_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0624_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1158_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0444_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0925_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1050_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1134_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0695_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0508_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0615_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0265_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1316_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1148_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1315_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0226_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0320_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0237_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1353_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0287_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0262_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1067_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0430_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0001_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0360_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0017_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1468_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0501_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1141_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0998_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1145_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1473_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0396_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0913_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0380_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0544_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1411_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0106_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0442_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0418_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0491_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0368_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0296_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0483_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0536_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1306_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1347_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1467_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0969_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1036_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0010_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1044_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0619_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1437_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0282_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1162_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0750_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0126_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0192_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1362_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1435_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1359_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0685_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1469_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1338_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0273_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0060_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0229_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0358_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0743_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1294_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1136_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0155_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1122_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0707_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1301_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0814_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0083_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0303_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1208_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1116_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0512_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0205_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0847_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0464_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1365_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1266_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0278_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0075_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0201_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1308_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1235_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0056_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0061_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0917_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0401_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0617_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1236_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0974_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0693_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1245_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0524_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0541_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0351_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0099_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0614_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0403_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0321_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0370_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1379_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0084_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0666_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0783_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0028_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0208_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1244_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1300_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0883_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1239_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0308_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0035_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0486_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1271_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0078_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1127_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0377_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0652_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1443_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0711_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0838_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0322_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1222_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0290_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0857_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1062_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0186_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1114_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1073_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0802_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1123_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0281_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0770_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0620_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0581_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0456_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0094_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0092_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0645_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1426_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0334_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0809_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0422_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1170_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0153_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0147_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0100_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0636_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1026_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0639_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0081_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0720_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1094_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0381_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0342_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1387_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0347_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0191_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0409_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1118_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0159_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0485_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1009_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1165_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0116_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0586_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1381_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0668_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0992_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0073_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0664_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1264_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0183_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0822_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0338_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1200_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0479_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0972_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0826_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0780_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0432_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0373_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0767_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0469_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0779_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0217_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1177_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0364_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1014_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0461_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1209_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0976_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0804_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0549_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1180_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0120_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1401_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0559_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0933_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1157_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0114_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0480_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0964_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1023_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0580_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1053_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0266_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0438_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0875_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1286_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0123_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0729_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0937_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0295_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0277_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0069_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1006_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0994_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1100_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0960_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1407_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0741_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0374_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1065_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0730_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0306_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1074_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0222_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0008_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0763_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0563_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1172_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0632_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1092_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0906_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0613_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0890_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1070_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0136_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0947_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1238_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0882_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0495_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1128_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1279_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0575_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1201_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0404_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0773_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0898_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1321_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0846_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0518_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1367_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0345_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0015_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0451_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1297_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1241_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0987_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0150_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0696_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0807_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1035_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1434_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0649_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1404_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1281_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0768_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0738_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1309_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1133_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1058_text_document 
+0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0427_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0468_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0500_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0148_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0386_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0616_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0428_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0118_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0569_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0323_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0718_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1252_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1191_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0143_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0314_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0671_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1486_text_document +0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0596_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt 
b/ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt new file mode 100644 index 0000000000..18109f946d --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt @@ -0,0 +1,3 @@ +0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0001_text_document +0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0000_text_document +0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0002_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_peS2o.txt b/ALCF/data-lists/polaris/data_file_list_peS2o.txt new file mode 100644 index 0000000000..3f2ddfb299 --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_peS2o.txt @@ -0,0 +1,42 @@ +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0039_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0014_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0034_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0007_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0020_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0026_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0036_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0030_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0015_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0018_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0033_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0027_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0023_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0024_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0009_text_document +0.057 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0025_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0010_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0032_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0029_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0021_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0040_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0000_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0013_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0005_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0022_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0011_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0038_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0003_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0019_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0031_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0012_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0041_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0004_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0001_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0037_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0006_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0016_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0002_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0017_text_document +0.057 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0028_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0035_text_document +0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0008_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_stack-code.txt b/ALCF/data-lists/polaris/data_file_list_stack-code.txt new file mode 100644 index 0000000000..f5049cd9e4 --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_stack-code.txt @@ -0,0 +1,4435 @@ +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0068_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0074_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0063_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0001_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0072_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0100_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0089_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0022_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0007_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rhtml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl6/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gap/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/alloy/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/squirrel/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0009_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rouge/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/textile/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/labview/v3-0000_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mask/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/openscad/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/openscad/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/maxscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/modelica/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/modelica/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unrealscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0009_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lookml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opal/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0025_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0014_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0095_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0016_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0017_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0006_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/module-management-system/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opa/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ren'py/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ren'py/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0050_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0007_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0054_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0038_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0033_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0005_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/maple/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/numpy/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/realbasic/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pure-data/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pure-data/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ceylon/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0008_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0066_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0078_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0013_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0087_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0102_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0104_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0057_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0105_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0092_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0106_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0103_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0107_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0089_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0108_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0039_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0101_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/augeas/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0087_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0052_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0075_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0073_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rdoc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logtalk/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c2hs-haskell/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pov-ray-sdl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ioke/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/verilog/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clips/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/chuck/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pod/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pod/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xojo/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0028_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0011_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/component-pascal/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/component-pascal/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0003_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lex/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lex/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/grace/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0001_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sas/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sas/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/netlogo/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0068_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0076_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0061_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0085_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autoit/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autoit/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zephir/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/robotframework/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/propeller-spin/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xquery/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/txl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nu/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ampl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tea/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/csound/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/brightscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0025_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0003_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slim/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slim/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/red/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/thrift/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pony/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-coffeescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ats/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot-internal-representation/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lolcode/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/digital-command-language/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/abap/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/abap/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lsl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hlsl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hlsl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scaml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ags-script/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0057_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0037_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0006_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0059_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0094_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0055_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0011_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0030_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xs/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mtml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rmarkdown/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kit/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mako/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/igor-pro/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sourcepawn/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sourcepawn/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/apl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nginx/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0044_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0075_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0038_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0036_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/piglatin/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0025_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0034_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0033_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0013_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/awk/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vcl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/applescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webidl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0005_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glyph/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/papyrus/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/boo/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hy/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/d/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/aspectj/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/isabelle/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/isabelle/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sqf/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sqf/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/volt/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/monkey/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lfe/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0014_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clarion/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/oxygene/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/metal/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nsis/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0001_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/muf/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dylan/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xbase/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autohotkey/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autohotkey/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0009_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clean/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ston/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/creole/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ecl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eiffel/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eiffel/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0025_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0029_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/darcs-patch/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0003_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dm/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot-assembly/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0014_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cobol/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/io/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/saltstack/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ox/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/matlab/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/renderscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purescript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dogescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/omgrofl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/supercollider/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0019_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0067_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0075_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0035_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0081_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/flux/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0001_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-haskell/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcsh/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0010_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+eex/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/golo/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pawn/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0024_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/oz/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/idl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-agda/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/click/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/krl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/inform-7/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stan/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/livescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion-cfc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion-cfc/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gosu/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0051_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0091_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0055_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0011_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0036_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lilypond/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/moonscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0071_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0094_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0054_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0040_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0073_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/idris/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fish/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0005_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0006_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0093_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0086_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0082_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0063_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0036_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/myghty/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0012_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/latte/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turing/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bison/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emberscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xpages/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/linker-script/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/linker-script/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0050_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0035_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0036_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cycript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scilab/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-eclass/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0071_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0066_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0078_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0029_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0025_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0067_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0075_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0002_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0006_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy-server-pages/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy-server-pages/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dns-zone/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0006_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/netlinx/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/irc-log/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mirah/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0019_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0014_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0012_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0045_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0010_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0021_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0017_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0037_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0077_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0053_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0014_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purebasic/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cartocss/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/j/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jflex/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0021_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0094_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0070_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0098_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0043_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qmake/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/api-blueprint/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0093_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0007_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0074_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0045_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0073_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pan/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/brainfuck/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ecere-projects/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fantom/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/grammatical-framework/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/befunge/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0052_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0035_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0005_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objdump/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0014_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/urweb/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0021_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0017_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0037_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0077_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0053_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/blitzmax/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/liquid/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/liquid/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/genshi/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ninja/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ninja/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gams/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lasso/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/desktop/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0009_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/agda/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/agda/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0003_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m4/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m4/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coq/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0096_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0062_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0039_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-ebuild/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-ebuild/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/factor/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/uno/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/apacheconf/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pogoscript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jasmin/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0051_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0094_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0082_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0056_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0033_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0010_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bluespec/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nit/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ec/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/raml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rebol/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0026_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-j/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bro/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sparql/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/chapel/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pike/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0017_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0042_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/harbour/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unified-parallel-c/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0050_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0032_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0074_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0040_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0010_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xtend/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ooc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shen/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/self/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/x10/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cirru/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/redcode/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mupad/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0019_text_document 
+0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0064_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0067_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0075_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0083_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0035_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0098_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0081_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0003_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/inno-setup/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0037_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0019_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0011_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/wisp/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0004_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/http/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/http/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/forth/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0000_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0002_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ragel-in-ruby-host/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nesc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nesc/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slash/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0058_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0012_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0022_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zimpl/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sage/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/blitzbasic/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/octave/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fancy/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/antlr/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bitbake/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bitbake/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0009_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0011_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0032_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0026_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0005_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vala/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cap'n-proto/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/prolog/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shellsession/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python-traceback/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0015_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0016_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0087_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0025_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0019_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0015_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0009_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0064_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0050_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0058_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0093_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0059_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0051_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0021_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0071_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0008_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0044_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0072_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0097_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0068_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0080_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0057_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0047_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0052_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0067_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0014_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0023_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0034_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0032_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0007_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0060_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0086_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0094_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0091_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0017_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0090_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0066_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0046_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0096_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0100_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0020_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0076_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0092_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0048_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0049_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0075_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0026_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0012_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0095_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0074_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0083_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0054_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0070_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0082_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0055_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0024_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0037_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0088_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0078_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0065_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0062_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0089_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0099_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0061_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0041_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0079_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0035_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0002_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0028_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0045_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0040_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0004_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0098_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0038_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0056_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0063_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0011_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0003_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0077_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0042_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0013_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0029_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0039_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0022_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0084_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0085_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0069_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0081_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0006_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0073_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0027_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0010_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0043_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0005_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0016_text_document +0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0033_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0018_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0036_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0030_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0001_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0053_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0031_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xproc/v3-0000_text_document +0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/racket/v3-0000_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt b/ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt new file mode 100644 index 0000000000..134c1473b1 --- /dev/null +++ b/ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt @@ -0,0 +1,2 @@ +0.0045 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0000_text_document +0.0045 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0001_text_document From 27f66fd87053c0f3764e4dedc9afe8f3fd65c963 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 19 Apr 2024 11:35:52 -0500 Subject: [PATCH 187/268] Add support for DeepSpeed `FusedLamb` optimizer --- megatron/arguments.py | 7 +- megatron/optimizer/__init__.py | 146 +++++++++++++++++++++++++++++++-- 2 files changed, 144 insertions(+), 9 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index b982337f51..70dfa88ef6 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -923,10 +923,15 @@ def _add_training_args(parser): 'adam', 'adamw', 'sgd', + 'ds.fusedlamb', 'apex.adam', 'apex.sgd', 'adamwschedulefree', - 'sgdschedulefree' + 'sgdschedulefree', + 'galoreadamw', + 'adam8bit', + 
'galoreadamw8bit', + 'galoreadamw8bitperlayer' ], help='Optimizer function' ) diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 6dfd0ea40f..80adeb8521 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -3,6 +3,7 @@ from deepspeed.accelerator import get_accelerator import torch +from typing import Callable, Any from megatron import get_args from .distrib_optimizer import DistributedOptimizer @@ -10,19 +11,60 @@ from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer -def get_param_groups(modules, - no_weight_decay_cond, - scale_lr_cond, - lr_mult): - """creates param groups based on weight decay condition (regularized vs non regularized) - and learning rate scale condition (args.lr vs lr_mult * args.lr) - scale_lr_cond is used during finetuning where head of the network requires a scaled - version of the base learning rate. +import ezpz as ez +RANK = ez.get_rank() + + +def get_param_groups( + modules: torch.nn.Module | iter[torch.nn.Module], + no_weight_decay_cond: Callable[[str, torch.Tensor], bool], + scale_lr_cond: Callable[[str, torch.Tensor], bool], + lr_mult: Any, + use_galore: bool = False, +): + """ + Creates param groups (regularized vs non) based on: + + - weight decay condition. + - learning rate scale condition (args.lr vs lr_mult * args.lr) + - scale_lr_cond is used during finetuning, where head of the network + requires a scaled version of the base learning rate. 
+ # if 'galore' in args.optimizer.lower(): + # # make parameters with "rank" to a single group, if param_name has "mlp" or "attn" + # galore_params = [] + # target_modules_list = ["attn", "mlp"] + # # for module_name, module in param_groups: + # for group_id, group in enumerate(param_groups): + # for param, p in enumerate(group['params']): + # if not isinstance(module, torch.nn.Linear): + # continue + # if not any(target_key in module_name for target_key in target_modules_list): + # continue + # print('enable GaLore for weights in module: ', module_name) + # galore_params.append(module.weight) + # id_galore_params = [id(p) for p in galore_params] + # # make parameters without "rank" to another group + # regular_params = [p for p in param_groups if id(p) not in id_galore_params] + # # then call galore_adamw + # param_groups = [ + # { + # 'params': regular_params + # }, + # { + # 'params': galore_params, + # 'rank': RANK, + # 'update_proj_gap': args.update_proj_gap, + # 'scale': args.galore_scale, + # 'proj_type': args.proj_type + # } + # ] """ wd_no_scale_lr = [] wd_scale_lr = [] no_wd_no_scale_lr = [] no_wd_scale_lr = [] + galore_params = [] + target_modules_list = ["attn", "mlp"] for module in modules: for name, param in module.named_parameters(): if not param.requires_grad: @@ -83,6 +125,7 @@ def get_megatron_optimizer( param_groups = split_params_into_different_moe_groups_for_optimizer( param_groups ) + if args.cpu_optimizer: assert args.optimizer == 'adam', 'CPU offloading is for Adam' if args.cpu_torch_adam: @@ -97,6 +140,93 @@ def get_megatron_optimizer( betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps, ) + + elif args.optimizer.lower() == "galore_adamw": + from galore_torch import GaLoreAdamW, GaLoreAdamW8bit + # redefine way to call galore_adamw + optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw": + # redefine way to call galore_adamw + optimizer = 
GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay) + # implement adafactor + elif args.optimizer.lower() == "adafactor": + import transformers + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = transformers.optimization.Adafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # low-rank adafactor + elif args.optimizer.lower() == "galore_adafactor": + args.beta1 = None if args.beta1 == 0.0 else args.beta1 + optimizer = GaLoreAdafactor( + param_groups, + lr=args.lr, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=args.beta1, + weight_decay=args.weight_decay, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + # 8-bit Adam + elif args.optimizer.lower() == "adam8bit": + import bitsandbytes as bnb + optimizer = bnb.optim.Adam8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == "galore_adamw8bit": + optimizer = GaLoreAdamW8bit(param_groups, lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer.lower() == 'galore_adamw8bit_per_layer': + # TODO: seems scheduler call twice in one update step, need to check, for now double the num_training_steps, warmup_steps and update_proj_gap + optimizer_dict = {} + for p in model.parameters(): + if p.requires_grad: + if id(p) in id_galore_params: + optimizer_dict[p] = GaLoreAdamW8bit([{'params': [p], 'rank': args.rank, 'update_proj_gap': args.update_proj_gap * 2, 'scale': args.galore_scale, 'proj_type': args.proj_type}], lr=args.lr, weight_decay=args.weight_decay) + else: + optimizer_dict[p] = bnb.optim.Adam8bit([p], lr=args.lr, weight_decay=args.weight_decay) + # get scheduler dict + scheduler_dict = {} + from galore_torch.peft_pretraining import training_utils + for p in model.parameters(): + if p.requires_grad: + 
scheduler_dict[p] = training_utils.get_scheculer( + optimizer=optimizer_dict[p], + scheduler_type=args.scheduler, + num_training_steps=args.num_training_steps * 2, + warmup_steps=args.warmup_steps * 2, + min_lr_ratio=args.min_lr_ratio, + ) + + def optimizer_hook(p): + if p.grad is None: + return + optimizer_dict[p].step() + optimizer_dict[p].zero_grad() + scheduler_dict[p].step() + # Register the hook onto every parameter + for p in model.parameters(): + if p.requires_grad: + p.register_post_accumulate_grad_hook(optimizer_hook) + layer_wise_flag = True + elif str(args.optimizer).lower() == 'ds.fusedlamb': + from deepspeed.ops.lamb import FusedLamb + optimizer = FusedLamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) elif str(args.optimizer).lower() == 'adamwschedulefree': import schedulefree optimizer = schedulefree.AdamWScheduleFree( From fc1b347ca8e6c246309e43fa1a8600ee81b4de68 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 19 Apr 2024 12:20:58 -0500 Subject: [PATCH 188/268] Add `ALCF/data-lists/sunspot/*.txt` --- ...erg-books.txt => data_file_list_books.txt} | 0 ...tack-code.txt => data_file_list_stack.txt} | 0 ...-en-simple.txt => data_file_list_wiki.txt} | 0 .../sunspot/data_file_list_books.txt | 3 + ALCF/data-lists/sunspot/data_file_list_c4.txt | 86 + ALCF/data-lists/sunspot/data_file_list_cc.txt | 2880 +++++++++++++++++ .../sunspot/data_file_list_peS2o.txt | 26 + .../sunspot/data_file_list_reddit.txt | 78 + .../sunspot/data_file_list_stack.txt | 149 + .../sunspot/data_file_list_wiki.txt | 2 + 10 files changed, 3224 insertions(+) rename ALCF/data-lists/polaris/{data_file_list_gutenberg-books.txt => data_file_list_books.txt} (100%) rename ALCF/data-lists/polaris/{data_file_list_stack-code.txt => data_file_list_stack.txt} (100%) rename ALCF/data-lists/polaris/{data_file_list_wiki-en-simple.txt => data_file_list_wiki.txt} (100%) create mode 100644 
ALCF/data-lists/sunspot/data_file_list_books.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_c4.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_cc.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_peS2o.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_reddit.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_stack.txt create mode 100644 ALCF/data-lists/sunspot/data_file_list_wiki.txt diff --git a/ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt b/ALCF/data-lists/polaris/data_file_list_books.txt similarity index 100% rename from ALCF/data-lists/polaris/data_file_list_gutenberg-books.txt rename to ALCF/data-lists/polaris/data_file_list_books.txt diff --git a/ALCF/data-lists/polaris/data_file_list_stack-code.txt b/ALCF/data-lists/polaris/data_file_list_stack.txt similarity index 100% rename from ALCF/data-lists/polaris/data_file_list_stack-code.txt rename to ALCF/data-lists/polaris/data_file_list_stack.txt diff --git a/ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt b/ALCF/data-lists/polaris/data_file_list_wiki.txt similarity index 100% rename from ALCF/data-lists/polaris/data_file_list_wiki-en-simple.txt rename to ALCF/data-lists/polaris/data_file_list_wiki.txt diff --git a/ALCF/data-lists/sunspot/data_file_list_books.txt b/ALCF/data-lists/sunspot/data_file_list_books.txt new file mode 100644 index 0000000000..9187565a5e --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_books.txt @@ -0,0 +1,3 @@ +0.0031007020167215667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0000_text_document +0.003100207465277759 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0001_text_document +0.000999090518000674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_c4.txt b/ALCF/data-lists/sunspot/data_file_list_c4.txt new file mode 100644 index 
0000000000..ca7df1839e --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_c4.txt @@ -0,0 +1,86 @@ +0.0011545953050729803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0000_text_document +0.0011570295715413383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0001_text_document +0.001156438391210766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0002_text_document +0.0011556820995190797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0003_text_document +0.001156780334924253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0004_text_document +0.0011563528368937514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0005_text_document +0.0011574632716369762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0006_text_document +0.0011577445131424494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0007_text_document +0.0011599182963630329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0008_text_document +0.0011550792360663698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0009_text_document +0.001154948574643344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0010_text_document +0.0011560157369398198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0011_text_document +0.0011551344387810997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0012_text_document +0.0011586914190552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0013_text_document +0.00115559584811127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0014_text_document +0.0011562917764239204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0015_text_document +0.0011582019252872318 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0016_text_document +0.0011585605528399534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0017_text_document +0.0011567600261132287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0018_text_document +0.0011561323235067436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0019_text_document +0.0011568948157687324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0020_text_document +0.0011562184926986983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0021_text_document +0.001155171968076667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0022_text_document +0.001156245876059478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0023_text_document +0.0011591826911770261 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0024_text_document +0.0011564400126070828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0025_text_document +0.0011571005158517765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0026_text_document +0.0011560050453907214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0027_text_document +0.0011559074476966407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0028_text_document +0.0011567638698290205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0029_text_document +0.0011558972055942165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0030_text_document +0.001157532269673901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0031_text_document +0.0011559883017581377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0032_text_document +0.001155556362078353 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0033_text_document +0.0011544735837522018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0034_text_document +0.0011547315955415466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0035_text_document +0.0011570980852521353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0036_text_document +0.0011562552591307868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0037_text_document +0.001156640315842092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0038_text_document +0.0011587257748187634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0039_text_document +0.0011563083526351268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0040_text_document +0.0011554464046007336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0041_text_document +0.001155442922136426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0042_text_document +0.0011557081619451221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0043_text_document +0.001156421357082161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0044_text_document +0.0011562730825316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0045_text_document +0.001157525507046117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0046_text_document +0.0011552936629887162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0047_text_document +0.0011578959437852875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0048_text_document +0.0011568910557636293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0049_text_document +0.0011578444955946039 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0050_text_document +0.001157076096248001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0051_text_document +0.0011568459536403974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0052_text_document +0.0011555352450605598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0053_text_document +0.0011557650508322967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0054_text_document +0.0011567625802857914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0055_text_document +0.0011568533734967437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0056_text_document +0.0011562185375437102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0057_text_document +0.0011558740426473278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0058_text_document +0.0011549825990520978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0059_text_document +0.0011572314079774744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0060_text_document +0.0011576031815962752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0061_text_document +0.0011567937670018521 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0062_text_document +0.001154956951193276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0063_text_document +0.001157226898064118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0064_text_document +0.001156096958730414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0065_text_document +0.001155844223704128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0066_text_document +0.0011571187084765205 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0067_text_document +0.0011573954893981501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0068_text_document +0.0011566700251641518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0069_text_document +0.0011550051959552815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0070_text_document +0.0011559629359246125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0071_text_document +0.001157971629210032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0072_text_document +0.0011561725903411443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0073_text_document +0.001157160385935682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0074_text_document +0.0011568864860569239 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0075_text_document +0.0011576433208715313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0076_text_document +0.0011571382379808948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0077_text_document +0.0011590178523739284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0078_text_document +0.001156347684201892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0079_text_document +0.0011552550374817486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0080_text_document +0.0011570794132840427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0081_text_document +0.0011570932061148482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0082_text_document +0.0011561938025300182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0083_text_document +0.0011560757016965283 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0084_text_document +0.00019284851714729888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0085_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_cc.txt b/ALCF/data-lists/sunspot/data_file_list_cc.txt new file mode 100644 index 0000000000..4b9a797878 --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_cc.txt @@ -0,0 +1,2880 @@ +0.0002329030984435853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0000_text_document +0.00023018699207949078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0001_text_document +0.00024373839803694205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0002_text_document +0.00023608269234913788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0003_text_document +0.00024813091225197464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0004_text_document +0.00023520818074126314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0005_text_document +0.0002374607329273171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0006_text_document +0.00023738412849923294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0007_text_document +0.0002443634316582533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0008_text_document +0.00023847622533166118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0009_text_document +0.00023199871587697545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0010_text_document +0.0002385337709567312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0011_text_document +0.0002432839071745339 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0012_text_document +0.00023508523674007346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0013_text_document +0.00032603226617680567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0014_text_document +0.00023789141182395846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0015_text_document +0.0002461407443245122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0016_text_document +0.00023499257215518966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0017_text_document +0.00024846537508068473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0018_text_document +0.0002386611981191132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0019_text_document +0.0002476214516386151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0020_text_document +0.00023922963334203518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0021_text_document +0.0002566637890877035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0022_text_document +0.0002480836116312675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002500957846859012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0024_text_document +0.00023232303192858133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0025_text_document +0.0002402109920207785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0026_text_document +0.00032458741378655037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0027_text_document +0.00023711130623699136 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0028_text_document +0.0002473092752915358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024517111812673547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0030_text_document +0.00024145261714879915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0031_text_document +0.0002441832095655324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0032_text_document +0.00024533720808111173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0033_text_document +0.00024615543201451354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0034_text_document +0.00029788578618284437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0035_text_document +0.00026821245945822444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0036_text_document +0.0002451138188102186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0037_text_document +0.00023812823651070536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0038_text_document +0.00023799603175215714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0039_text_document +0.00024128396884325748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0040_text_document +0.00024158008848876737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0041_text_document +0.00024722330373436316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0042_text_document +0.00023308404070500205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002554252556503107 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0044_text_document +0.0003132025339147037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0045_text_document +0.00024278622445373792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0046_text_document +0.0003214585004572529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0047_text_document +0.0003329131703028111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0048_text_document +0.0002361664236831262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0049_text_document +0.0002643368247294079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0050_text_document +0.00024766538637149724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0051_text_document +0.0002627167479901225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0052_text_document +0.00025033496855447236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0053_text_document +0.00024160037266449382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0054_text_document +0.00022926708072112655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0055_text_document +0.00023577632399723273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0056_text_document +0.00024916378421745264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0057_text_document +0.00024065956580145883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0058_text_document +0.00032914757231594763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0059_text_document +0.000382735213415281 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0060_text_document +0.00019876415914729903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0061_text_document +0.0002455041228482986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0062_text_document +0.0002360975192355561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0063_text_document +0.00035687225557611647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0064_text_document +0.00034010734287544296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0065_text_document +0.00024289772720050695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0066_text_document +0.0002298464162081398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0067_text_document +0.00032731880189343956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0068_text_document +0.00024593154050122983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0069_text_document +0.00024184757636917526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0070_text_document +0.0002619883069796127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0071_text_document +0.00023707630401459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0072_text_document +0.0003648802259322563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0073_text_document +0.00034821518419266554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0074_text_document +0.00025687739808269634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0075_text_document +0.00025210376457187776 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0076_text_document +0.00025341417049958763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0077_text_document +0.00026096750660126574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002557323323244081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0079_text_document +0.0003306928457892949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0080_text_document +0.00034038835131844906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0081_text_document +0.00025944099107910257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0082_text_document +0.00011523229485833962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002577986281049885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0084_text_document +0.00011763411767853355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0085_text_document +0.00025348268598695737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0086_text_document +0.00032333206004171266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0087_text_document +0.00030755087408648437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0088_text_document +0.00023006508933660387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0089_text_document +0.00023529378653763827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0090_text_document +0.0002316006671871909 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0091_text_document +0.0002467080329046101 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002812385280195195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0093_text_document +0.0002999655363830447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0094_text_document +0.00030366253916544147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0095_text_document +0.00034483134052353947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0096_text_document +0.0002264669007084511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0097_text_document +0.0002601377797129039 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0098_text_document +0.000243683175313779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0099_text_document +0.0002458323373867855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0100_text_document +0.00023061358738763293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0101_text_document +0.0002383240957413279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0102_text_document +0.00024652411741760106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0103_text_document +0.00024356064371899462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0104_text_document +0.00023826916720633669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0105_text_document +0.00023583636824734604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0106_text_document +0.00023310828235332517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0107_text_document +0.00024133699058477928 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0108_text_document +0.00023757818755491814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0109_text_document +0.00024650642737935284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0110_text_document +0.00023587507176169633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0111_text_document +0.0002394516652010616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0112_text_document +0.00026115753562452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0113_text_document +0.00023919185015293048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0114_text_document +0.0002328737948830104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0115_text_document +0.0002449581587150213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0116_text_document +0.00023488566807302266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0117_text_document +0.0002461692650286432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0118_text_document +0.00023193321359714746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0119_text_document +0.00024814319189332457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0120_text_document +0.0002502054369100928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0121_text_document +0.0002294119999864264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0122_text_document +0.00023986985689573848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0123_text_document +0.00023333209217509475 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0124_text_document +0.0002268247786450586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0125_text_document +0.0002289098412617007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0126_text_document +0.00023635954118858026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0127_text_document +0.00024647215050850076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0128_text_document +0.00024326708810109974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0129_text_document +0.0002931046025004214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0130_text_document +0.00022529330733557138 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0131_text_document +0.00024288319647667783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0132_text_document +0.0003170441859608398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0133_text_document +0.00032183678547706126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0134_text_document +0.00020557308761968548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0135_text_document +0.00020890924417592562 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0136_text_document +0.00021111297420597103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0137_text_document +0.00021993650550023244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0138_text_document +0.0002123163519100286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0139_text_document +0.0002103629651549111 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0140_text_document +0.00021370932994199264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0141_text_document +0.00020399994203827728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0142_text_document +0.00021563034464531022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0143_text_document +0.0002119386189866467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0144_text_document +0.00020333697838057754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0145_text_document +0.00020812225502998168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0146_text_document +0.0002192034455873437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0147_text_document +0.0002146433860256116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0148_text_document +0.00022498320338620924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0149_text_document +0.00020605974297327904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0150_text_document +0.00020911517614300505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0151_text_document +0.00022086517759478398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0152_text_document +0.00021332423639106333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0153_text_document +0.00020576019154376813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0154_text_document +0.00020504347709097317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0155_text_document +0.00020777754226086552 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0156_text_document +0.00021294564928541406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0157_text_document +0.00020775275197134613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0158_text_document +0.00021002644029417448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0159_text_document +0.00021013797882725636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0160_text_document +0.00019076903434985646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0161_text_document +0.00019137766426576477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0162_text_document +0.0001841037351078922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0163_text_document +0.0001952863228508793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0164_text_document +0.00018602295481911772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0165_text_document +0.0001931370361427833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0166_text_document +0.0001801085437374987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0167_text_document +0.000188289716886196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0168_text_document +0.0001852865203803285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0169_text_document +0.00018892492640726607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0170_text_document +0.0001867706345514145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0171_text_document +0.00018688900901065678 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0172_text_document +0.00018978617486719294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0173_text_document +0.00019074400515584856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0174_text_document +0.00018895644551080948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0175_text_document +0.0002014139475504348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0176_text_document +0.00019178652165604014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0177_text_document +0.00019538713758341256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0178_text_document +0.00019221603071045457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0179_text_document +0.00018559646736351844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0180_text_document +0.00018839424919962872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0181_text_document +0.00031369302654824313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0182_text_document +0.00029283955302533026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003003216050130351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0184_text_document +0.00030560088357585723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0185_text_document +0.00030852297965873606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0186_text_document +0.00030137151200383515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0187_text_document +0.000287675564141583 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002865118305148982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002942394807592494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0190_text_document +0.0002892999122858095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0191_text_document +0.00029726222843843534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0192_text_document +0.0002865106197035132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0193_text_document +0.0002831383377282607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0194_text_document +0.00029911101649033976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0195_text_document +0.0002879193266837814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0196_text_document +0.000293888834619463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0197_text_document +0.00028471984768159116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0198_text_document +0.0002880090219919074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0199_text_document +0.0002916398711835823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0200_text_document +0.00029790830243728387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0201_text_document +0.00028328873748227157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0202_text_document +0.000295084201372288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0203_text_document +0.0002870500420988019 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0204_text_document +0.00028061238206088403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0205_text_document +0.00028268741759946835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0206_text_document +0.0002832900433124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0207_text_document +0.0002821269671667503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0208_text_document +0.00028388007298379026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0209_text_document +0.0002811354392519064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0210_text_document +0.0002811576793347316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0211_text_document +0.000291266961761568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0212_text_document +0.0002930917058536775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0213_text_document +0.00029247722771384336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0214_text_document +0.00030253733431717943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0215_text_document +0.0002988938219536017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0216_text_document +0.0003002888817617649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0217_text_document +0.00028686614758997625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0218_text_document +0.00032046548753382687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0219_text_document +0.00027752519729998216 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0220_text_document +0.00026529350985605245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0221_text_document +0.0002654493836819182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0222_text_document +0.00026232091015406547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0223_text_document +0.0002599081762104853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0224_text_document +0.0002835817651903514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0225_text_document +0.00026294839748242733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0226_text_document +0.0002610835823452124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0227_text_document +0.000260110886669002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0228_text_document +0.000253371820236557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0229_text_document +0.0002581811396117453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0230_text_document +0.0002514852630632709 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0231_text_document +0.00025726705673313424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0232_text_document +0.00025592912496079053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0233_text_document +0.00025012268192543976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0234_text_document +0.00024391340520007348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0235_text_document +0.0002384383639062725 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0236_text_document +0.00023975576001149118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0237_text_document +0.0002338016280970284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0238_text_document +0.0002439200883556984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0239_text_document +0.00024142268942556778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0240_text_document +0.0002427966777591219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0241_text_document +0.00024280144153436732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0242_text_document +0.00024065658615901044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0243_text_document +0.00024455143739741974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0244_text_document +0.00023239795390635735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0245_text_document +0.0002582911684560293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0246_text_document +0.00024625861259252923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0247_text_document +0.0002391576312805854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0248_text_document +0.000238078180343909 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0249_text_document +0.00023486425304981024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0250_text_document +0.0002355893518655022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0251_text_document +0.0002366129403678232 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0252_text_document +0.00023595832035066449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0253_text_document +0.00023327574008525872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0254_text_document +0.00024148789011315923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0255_text_document +0.0002373778500991465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0256_text_document +0.00023955987733466374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0257_text_document +0.000230949882722363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0258_text_document +0.00023691636140836262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0259_text_document +0.0002296963977634624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0260_text_document +0.0002332661069034444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0261_text_document +0.00023843042502126992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0262_text_document +0.00023511746712743498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0263_text_document +0.0002347369877896436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0264_text_document +0.0002323753243697275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0265_text_document +0.00026669348300156857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0266_text_document +0.00025799845912273273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0267_text_document +0.00027628560903016796 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0268_text_document +0.00026519284616643963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0269_text_document +0.00026441815097637077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0270_text_document +0.0002662131391195505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0271_text_document +0.00027728803868991606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0272_text_document +0.0002769764618252775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0273_text_document +0.00027646939593325287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0274_text_document +0.0002624622460988396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0275_text_document +0.0002597094641937235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0276_text_document +0.00026414993058715923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0277_text_document +0.00027056496256926013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0278_text_document +0.0002594411680362496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0279_text_document +0.00026263805833060905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0280_text_document +0.0002560343870682032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0281_text_document +0.0002624349038750109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0282_text_document +0.00025919416325410714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0283_text_document +0.0002611522977423299 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0284_text_document +0.00023679129688303509 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0285_text_document +0.0002424050866477902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0286_text_document +0.00022701047777126036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0287_text_document +0.00023885339653333248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0288_text_document +0.00024106734540671208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0289_text_document +0.0002258801520250309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0290_text_document +0.0003279882524990489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0291_text_document +0.00033565261995537515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0292_text_document +0.0003289323356607256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0293_text_document +0.0003074095430777535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0294_text_document +0.0003207680812935341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0295_text_document +0.00031455349141131964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0296_text_document +0.0003292847953027658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0297_text_document +0.0003336588045388259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0298_text_document +0.00031509118791912046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0299_text_document +0.0003142598967986839 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0300_text_document +0.00030783273695855995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0301_text_document +0.0003180584048660508 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0302_text_document +0.0003132932087805931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0303_text_document +0.00031883257979717144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0304_text_document +0.00030944547256766847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0305_text_document +0.00030308947812968015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0306_text_document +0.00027546560713402303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0307_text_document +0.0002849896883269672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0308_text_document +0.00028854314233644503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0309_text_document +0.00028915140229591915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0310_text_document +0.00028785031389006415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0311_text_document +0.00029386612956137296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0312_text_document +0.00027190973100817075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0313_text_document +0.00028482862326451903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0314_text_document +0.00028103519882799385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0315_text_document +0.00027510038584601916 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0316_text_document +0.00028413351954904745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0317_text_document +0.0002766838847779375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0318_text_document +0.00026734717208098886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0319_text_document +0.0002798212098651715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0320_text_document +0.0002747771651023886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0321_text_document +0.0002653649112010507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0322_text_document +0.0002631895073950362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0323_text_document +0.00027233897055462913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0324_text_document +0.00026295942114759743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0325_text_document +0.00030523368071333024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0326_text_document +0.00022951852300606208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0327_text_document +0.00022441558532523096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0328_text_document +0.00022508048810748277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0329_text_document +0.00021854625167048365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0330_text_document +0.00032578339433634126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0331_text_document +0.0003234065091465547 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0332_text_document +0.00031578848940780525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0333_text_document +0.0003211733834987297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0334_text_document +0.00030598592011548813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0335_text_document +0.00030636342203205056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0336_text_document +0.0003057832116313887 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0337_text_document +0.000314036788141844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0338_text_document +0.00030966829419359915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0339_text_document +0.00030590256959722885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0340_text_document +0.0003098044211320355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0341_text_document +0.00031610551467687426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0342_text_document +0.0003181946275637243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0343_text_document +0.00030594263323826957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0344_text_document +0.0003126680759448145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0345_text_document +0.0002992280964722656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0346_text_document +0.00029925238994904177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0347_text_document +0.0003002679127100512 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0348_text_document +0.00029525568123898354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0349_text_document +0.0003024653097967333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0350_text_document +0.0002953978348393056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0351_text_document +0.0003002611325611784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0352_text_document +0.0002957202302765213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0353_text_document +0.00029316969879070013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0354_text_document +0.00029927093466316167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0355_text_document +0.00029673566591636904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0356_text_document +0.0002937689672539696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0357_text_document +0.0002973606684406085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0358_text_document +0.0002964111065178358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0359_text_document +0.0003023024169175062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0360_text_document +0.0003023653161749783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0361_text_document +0.0003041586406248139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0362_text_document +0.00029561553630767535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0363_text_document +0.00024185982713467274 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0364_text_document +0.00023843085692504566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0365_text_document +0.00024640440430345615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0366_text_document +0.0002514283272863322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0367_text_document +0.0002428429062712565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0368_text_document +0.00023806417358106035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0369_text_document +0.000241345504518809 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0370_text_document +0.00023475737093303525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0371_text_document +0.00024315922889458298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0372_text_document +0.0002509834540572025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0373_text_document +0.00025303820591366467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0374_text_document +0.00023678822937901864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0375_text_document +0.00023171129872234371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0376_text_document +0.00024461347186013167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0377_text_document +0.00023799008209254456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0378_text_document +0.00023090419051131675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0379_text_document +0.0002236725770641727 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0380_text_document +0.00023567214707890686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0381_text_document +0.0002262722125540663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0382_text_document +0.00034312492202384507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0383_text_document +0.00021814471912144287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0384_text_document +0.00023259303719099642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0385_text_document +0.00031953022508126173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0386_text_document +0.00023554778297810253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0387_text_document +0.0002460294175476815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0388_text_document +0.0002407153820838108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0389_text_document +0.0002374237316074476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0390_text_document +0.00023980889380119253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0391_text_document +0.0002511495625217406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0392_text_document +0.0002455758117178104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0393_text_document +0.00024203242698955926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0394_text_document +0.00024139601603558614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0395_text_document +0.00024286894291167163 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0396_text_document +0.00023208951019510916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0397_text_document +0.0002357404012027918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0398_text_document +0.00023446333528494393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0399_text_document +0.0002366761658977476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0400_text_document +0.0002382598783135322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0401_text_document +0.00023065268726624828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0402_text_document +0.00022821836479753894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0403_text_document +0.00023184541693801962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0404_text_document +0.00023323789396160382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0405_text_document +0.00022765013762883577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0406_text_document +0.00023714308028716352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0407_text_document +0.00028689301916209046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0408_text_document +0.0003409253474017267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0409_text_document +0.0003375051344730567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0410_text_document +0.0003292176313040109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0411_text_document +0.00032955022485317955 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0412_text_document +0.0003279397699428092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0413_text_document +0.0003197789907967984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0414_text_document +0.00031901270687106177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0415_text_document +0.000321273794216131 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0416_text_document +0.0003220857325921838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0417_text_document +0.00031002969769902754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0418_text_document +0.00031282247512778876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0419_text_document +0.0003087408247659614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0420_text_document +0.0003000588357430778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0421_text_document +0.0003050525128747414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0422_text_document +0.0003038755807622741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0423_text_document +0.00029692774685276133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0424_text_document +0.0003116160903862434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0425_text_document +0.00031084101832927995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0426_text_document +0.00030708350656830715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0427_text_document +0.00031743538194191725 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0428_text_document +0.00031694261996253895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0429_text_document +0.0003146446823405206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0430_text_document +0.00030156651655858596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0431_text_document +0.000303240651608455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0432_text_document +0.00032558453868072364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0433_text_document +0.0002973680179620588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0434_text_document +0.0002971760577119216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0435_text_document +0.0002973002298006474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0436_text_document +0.0002878620791957177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0437_text_document +0.00029632190555443135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0438_text_document +0.0002946733596926658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0439_text_document +0.00029877307004917556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0440_text_document +0.00029551091884749816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0441_text_document +0.0002976670701108049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0442_text_document +0.0002888352867396029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0443_text_document +0.0002866799361024954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0444_text_document +0.0002859222006630905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0445_text_document +0.00028581831052887173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0446_text_document +0.00028506927387831265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0447_text_document +0.0002803249093757669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0448_text_document +0.0002809203104492272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0449_text_document +0.00028454145587367076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0450_text_document +0.00028584177277598123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0451_text_document +0.00028086934160805217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0452_text_document +0.000270936293938279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0453_text_document +0.00028304258342716634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0454_text_document +0.00028276074943094315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0455_text_document +0.0002602100764561298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0456_text_document +0.00028012504824815937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0457_text_document +0.0002608944608134916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0458_text_document +0.0002845289889094832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0459_text_document +0.0002717532367216808 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0460_text_document +0.0002643974553814476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0461_text_document +0.0002758213344366294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0462_text_document +0.0002753861114186629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0463_text_document +0.00031845649723981725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0464_text_document +0.00032153756772406746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0465_text_document +0.0003223378422301534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0466_text_document +0.0002996787108131847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0467_text_document +0.00030486709979224023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0468_text_document +0.00031053773722556385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0469_text_document +0.0003002771838331003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0470_text_document +0.00029794449770130684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0471_text_document +0.0003033670930430196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0472_text_document +0.0002965031647098184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0473_text_document +0.0002837085032811094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0474_text_document +0.0002828420727162801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0475_text_document +0.00028941167269403106 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0476_text_document +0.00029157564190928313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0477_text_document +0.00029812762761704826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0478_text_document +0.0002961388642406645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0479_text_document +0.0002838466433847451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0480_text_document +0.0002788779144959817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0481_text_document +0.0003402152386086791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0482_text_document +0.00037332501068667467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0483_text_document +0.0002413675200116708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0484_text_document +0.0003704235275199961 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0485_text_document +0.0002379466982220781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0486_text_document +0.00035089333509974934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0487_text_document +0.00023630817154070126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0488_text_document +0.00023857309295728839 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0489_text_document +0.0002435822475458576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0490_text_document +0.00023387703405383536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0491_text_document +0.00034319854187343774 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0492_text_document +0.0003622737409420836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0493_text_document +0.00023570573166970698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0494_text_document +0.00022641527241191097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0495_text_document +0.00034243292431352653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0496_text_document +0.00024045245535407698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0497_text_document +0.00023676532885361976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0498_text_document +0.00022335363118071338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0499_text_document +0.00023448598925498735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0500_text_document +0.00033737048365832474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0501_text_document +0.0003357751601882351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0502_text_document +0.0003383236392673138 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0503_text_document +0.0003397838415177592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0504_text_document +0.00033705937300296186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0505_text_document +0.00033448155827902774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0506_text_document +0.00034576892094196856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0507_text_document +0.00033674871522955814 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0508_text_document +0.0003328110361659434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0509_text_document +0.00032432631363958473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0510_text_document +0.00032731656932112217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0511_text_document +0.00032024116066153716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0512_text_document +0.0003040305172335454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0513_text_document +0.00031659687802842567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0514_text_document +0.000303687860573204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0515_text_document +0.0003155611705529593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0516_text_document +0.00030697272991348575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0517_text_document +0.00032874805540012775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0518_text_document +0.0003195460475675836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0519_text_document +0.00029999019685462926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0520_text_document +0.0003031992730055188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0521_text_document +0.0003004957313392662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0522_text_document +0.00029242533089655584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0523_text_document +0.0002940539652538529 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0524_text_document +0.0003042748602544184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0525_text_document +0.00029329988520120374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0526_text_document +0.00028533980088048884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0527_text_document +0.0002995523399640371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0528_text_document +0.00024445982369612285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0529_text_document +0.0002341949821161716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0530_text_document +0.0002448827406649086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0531_text_document +0.0002464661023748273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0532_text_document +0.0002458273043503861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0533_text_document +0.000234131092194839 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0534_text_document +0.00023502842288340058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0535_text_document +0.00023472409854696446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0536_text_document +0.0002353934437680525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0537_text_document +0.00023298716740292522 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0538_text_document +0.00023724345571185632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0539_text_document +0.0002463911915031484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0540_text_document +0.00023298903026561056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0541_text_document +0.00022884149754863258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0542_text_document +0.00023103945956545342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0543_text_document +0.0002444088792883614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0544_text_document +0.00022581722858094737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0545_text_document +0.0002370810502668904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0546_text_document +0.00022632319324174496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0547_text_document +0.00023710168144645038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0548_text_document +0.00022964923090952467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0549_text_document +0.00023748320722538985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0550_text_document +0.0003222624777361089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0551_text_document +0.0002939065142920207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0552_text_document +0.0003163669341858318 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0553_text_document +0.0002875568128154461 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0554_text_document +0.0002891174847690085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0555_text_document +0.0002845830978145091 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0556_text_document +0.0002834617830618547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0557_text_document +0.0002825955578364204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0558_text_document +0.0002770681818983043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0559_text_document +0.00027398693963975244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0560_text_document +0.00026761486776881346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0561_text_document +0.0002709662939745425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0562_text_document +0.0002715205476986883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0563_text_document +0.0002694875173937183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0564_text_document +0.0002691404382855153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0565_text_document +0.0002555696578650904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0566_text_document +0.00025938400199289785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0567_text_document +0.00025330279781755557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0568_text_document +0.00025455190919542185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0569_text_document +0.0002596474980952091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0570_text_document +0.0002593765878092823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0571_text_document +0.00026530976177812846 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0572_text_document +0.00026521586959931293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0573_text_document +0.00027156192778243744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0574_text_document +0.00026542489893346987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0575_text_document +0.0002637742757379441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0576_text_document +0.0002660391549513622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0577_text_document +0.0002622961692249776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0578_text_document +0.0002668259130904866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0579_text_document +0.00026393281403990296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0580_text_document +0.0002573087912247817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0581_text_document +0.0002689284845925933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0582_text_document +0.0002587878565641303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0583_text_document +0.0002591277179432351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0584_text_document +0.00025645748667058553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0585_text_document +0.0002576834953920859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0586_text_document +0.0002574007659976351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0587_text_document +0.00026215195926907863 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0588_text_document +0.0002550452573299244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0589_text_document +0.0002580549425113166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0590_text_document +0.0002580184320809385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0591_text_document +0.00026135902243793944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0592_text_document +0.0002499110939933153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0593_text_document +0.00023602977130289638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0594_text_document +0.0002179537404034863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0595_text_document +0.000217790844069029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0596_text_document +0.00021511798361299487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0597_text_document +0.00025422459968044684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0598_text_document +0.00026310640293852807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0599_text_document +0.0003408740036680742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0600_text_document +0.00025777786217145044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0601_text_document +0.00025244460970438263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0602_text_document +0.00025351648924446906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0603_text_document +0.0003423231978018855 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0604_text_document +0.0003423953052478566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0605_text_document +0.0003318569148201118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0606_text_document +0.00032767638761629247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0607_text_document +0.00033215390937927025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0608_text_document +0.00032618622802635676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0609_text_document +0.00032507622347617733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0610_text_document +0.00031030763419557833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0611_text_document +0.00024643590119480534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0000_text_document +0.0002095902169870633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0001_text_document +0.00021403593088797332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0002_text_document +0.0002227102409085263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00020197706221244385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00022874875522106917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00023280550472601052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0006_text_document +0.00022749578163623905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0007_text_document +0.00023802912323224644 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0008_text_document +0.00023176496190267302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0009_text_document +0.0002278986856648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0010_text_document +0.00021833909531790053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0011_text_document +0.0003080057114591217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0012_text_document +0.00021694016663911526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0013_text_document +0.0002668830492707773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0014_text_document +0.00024523658363304193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0015_text_document +0.0002894756615830288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00020347856162111349 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002134325832786435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00021673235231198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0019_text_document +0.0002654127125833355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0020_text_document +0.0002158672209137081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00023947604851382316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0022_text_document +0.00026152140024106367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0023_text_document 
+0.00021518621527788343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0002439782139658387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0025_text_document +0.0002905141391659118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0026_text_document +0.00021642682185339982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00019960430947798375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00026322267340937706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022334429465509248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0030_text_document +0.00022855119280875728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00028578658731994404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0032_text_document +0.0002584277862839571 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0033_text_document +0.00021861958226794765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00026614391185475836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00028970533715167736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0036_text_document +0.0002235814952215254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0037_text_document +0.00022032188312044515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0038_text_document +0.00022884461811511293 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0002551680347396578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0040_text_document +0.00022883355545520197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0002232938120141678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0002691617763064546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00023572139842386745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0002552819803341825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0045_text_document +0.00027155660031106415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0046_text_document +0.00021551548292117663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0047_text_document +0.00020620735756494168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0002166820604491231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0049_text_document +0.00018501398539579828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0050_text_document +0.00027225222848112053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00023371832644559636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00023566702124489628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0053_text_document +0.00023686334707090557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0054_text_document 
+0.00022423975285568458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0055_text_document +0.0002528257228301147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0002561855163693918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00022810786925037496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0058_text_document +0.0002762405538154904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0059_text_document +0.00022261162863844723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00022540915157909426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0061_text_document +0.00022299985657677767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0062_text_document +0.00022755525774778565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00024165856540482104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0064_text_document +0.00025687628451136137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0065_text_document +0.0002231870244226192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0066_text_document +0.00026580529164370396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0067_text_document +0.00028870521089646587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00021540624754582923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00025778332069476944 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0070_text_document +0.00021926796929661694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00026029886649394187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00022285796310592967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0073_text_document +0.00023080628286139754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0074_text_document +0.00025245808263416443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00022457772027503216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0076_text_document +0.00024435224362284627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0077_text_document +0.00022526086938759533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0002673487094116284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00023263089713557213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0080_text_document +0.00021778225362633044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0081_text_document +0.00021409630017652816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0082_text_document +0.00022267016739539933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00020585884947224638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00022993683686780696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0085_text_document 
+0.00024242353683668374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0002092411836993767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0087_text_document +0.0002197488902020793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00021875038642425168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0002494827261520774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0090_text_document +0.00023601123399284122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0091_text_document +0.00021826172481591926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0092_text_document +0.000236632672200321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00025074570040713444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00022642420961164095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00023812142057551977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0096_text_document +0.0002428821562055837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00022488741946885592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0098_text_document +0.00020317409833506262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0099_text_document +0.00021856439903312987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0100_text_document +0.0002106925714107645 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0101_text_document +0.00021119826681040816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0002592340274790045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00023255611509461946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0104_text_document +0.00020894883617804318 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00022615604129768463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0106_text_document +0.000203728797783905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0107_text_document +0.0001989690950208705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0108_text_document +0.00021734356057002846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0109_text_document +0.0002433390106922548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00022031295850762523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00022344289507866802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0112_text_document +0.00022230083290263739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0113_text_document +0.00021439002065826426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0002041951415667326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0115_text_document +0.00022877491032651992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0116_text_document 
+0.00021999090587860643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0117_text_document +0.00025682432698074305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0118_text_document +0.00024400030399295212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0119_text_document +0.00022789294060424558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0120_text_document +0.00021497724986548528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0121_text_document +0.00023813142494777905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0122_text_document +0.00021895635220322673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00023328497887722523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0124_text_document +0.00022164528342855325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0125_text_document +0.0002484042811809953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00021121568758750245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00020558498767931708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00024543621326022564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00019902438240619879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00023691721805865155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0131_text_document +0.00021791494779355714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0132_text_document +0.0002240264291639859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0133_text_document +0.0002473539109425455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0002071473371471445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00021022258828332134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0136_text_document +0.00022311670653909265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0137_text_document +0.00022930107525031038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0002214421423002716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00021570132519262982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002197681200389886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0141_text_document +0.0002800029152388595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0142_text_document +0.00026843440765131945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002849765317975514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0144_text_document +0.00027096319463304773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027086227426919104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002526247335698449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0147_text_document 
+0.00027363240217034764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0002623467059155748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0149_text_document +0.00027346078063921375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0150_text_document +0.00025920642956814055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0151_text_document +0.00025705335691494745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0152_text_document +0.00025922805782841715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0153_text_document +0.0002788336705199961 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0154_text_document +0.00024845909125095083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00028656519284339746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0156_text_document +0.00025647131598268287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0157_text_document +0.0002784068234736532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0158_text_document +0.0002528120161786896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0159_text_document +0.0002488190053053583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0160_text_document +0.0002704389893183884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0161_text_document +0.00025616941425622545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0162_text_document +0.00026029019534693783 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00025685556571703545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0164_text_document +0.00019723833812640722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0165_text_document +0.0001895418580073486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00019011078486016846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0167_text_document +0.00018779376696334834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0168_text_document +0.00018563641007150188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00018754827458482748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0170_text_document +0.00019755194962803275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00028610572842390993 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0172_text_document +0.00019902354772130188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0173_text_document +0.00020283251106846995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00018722834815639619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0175_text_document +0.00018348325202476222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002739432916909774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00014534657139819037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0178_text_document 
+0.00015282753276716084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0001549244865585569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0180_text_document +0.0001465220076427807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0181_text_document +0.00015309131688759006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0001462273984264752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0183_text_document +0.00014903597785697923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0184_text_document +0.0001547302246314982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0001486478323505694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00014887945296702178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00014582128695700495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00015040846513981096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0001492663985213415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0001491503509128408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0191_text_document +0.00014485595166153977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0192_text_document +0.00014471245274265675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0193_text_document +0.0001539836098505113 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0194_text_document +0.00014269340600113259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0001366015589763494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0196_text_document +0.00014275967558886846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0197_text_document +0.00012216291308335102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0198_text_document +9.860253447438225e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0199_text_document +0.00013395002197992724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0200_text_document +0.00013095775634161855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0201_text_document +0.00013244501748701574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0202_text_document +0.00013344638268905827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00013599432127141194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0204_text_document +0.0001319495730149868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0001286425479982177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0001288175023456875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0207_text_document +0.00014061678080985136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0208_text_document +0.000128553766351679 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0209_text_document 
+0.00013865417327932483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00012918889813006947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0211_text_document +0.00013369372633056305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0212_text_document +0.00012818148109232114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00013087168186794624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0214_text_document +0.00012209941459024034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0001170049632015973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0216_text_document +0.00013033065279061172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00012782387759971287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0218_text_document +0.00012594444140907917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0219_text_document +0.00012747350244869554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0220_text_document +0.00011189052700824495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0221_text_document +0.000118474284791765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0222_text_document +0.00012947220948400783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0223_text_document +0.00011563584378100779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0224_text_document +0.00012898102925965738 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0225_text_document +0.000122859118523654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0226_text_document +0.00013841949453733798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00012735223374055142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00013005120882648248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0229_text_document +0.000133953509788018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00012898361006981912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00012385687424414202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0232_text_document +0.00012495169231715962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0001334287109141697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0234_text_document +0.0001251557347669207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00012458204389205325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0236_text_document +0.00013142493999218836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0001234876747521603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0238_text_document +0.00011414056156548952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0239_text_document +0.00023536944102421793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0240_text_document 
+0.00020899836320101376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00020694945512603853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0001985515975806629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0243_text_document +0.00020332234597425947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00019901014809176087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0245_text_document +0.00019730742496077176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0246_text_document +0.0002086531104287768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00019880240459684486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0248_text_document +0.0001934729054969894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0249_text_document +0.00020006177554040137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0001941325758266985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00020329878081065027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0252_text_document +0.00020327608562464652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00019798005487177493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0001954984594242001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0001990223203741723 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0256_text_document +0.00019108660381768295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00019716779886134537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0258_text_document +0.0001928475026596504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0259_text_document +0.00019634937526499807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0260_text_document +0.00019298574642019224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0261_text_document +0.00018884134414178089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0262_text_document +0.00018997833083144106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0263_text_document +0.0001905325885044214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0264_text_document +0.00020263821458910917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0002079379871094917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0266_text_document +0.00019785431238092052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0267_text_document +0.00018722610077594935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00019937636744768995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00018558334637361332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0270_text_document +0.00019000469868035166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0271_text_document 
+0.0001853064471865308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000189466635918149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0273_text_document +0.00019109828052136198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0274_text_document +0.00018290456266579745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0275_text_document +0.00017877060456109023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0276_text_document +0.00018344271945962216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0277_text_document +0.0001937669621232641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0278_text_document +0.00019434311583686195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0279_text_document +0.0001805150932807986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0001914582846585569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00020025771498172507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0282_text_document +0.00019924956568197525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0283_text_document +0.000189496868442045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0001929642820365483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0285_text_document +0.0001903124937955297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0286_text_document +0.00019497565890742164 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00018960064504727124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00018568951646616373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0289_text_document +0.00018239686989629257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00018605553146990633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0291_text_document +0.0001844096767388669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00017898307999377337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0001739406120499752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0294_text_document +0.0001911537409150027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00017663348174413226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00017913373123918278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00017455805527093036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0298_text_document +0.00017536417503931625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00017329247651270448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0300_text_document +0.00017912565587258707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0301_text_document +0.00017228776664782256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0302_text_document 
+0.0001825947205735245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0001696263054898423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00017175867341643253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0305_text_document +0.0001668734295531042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0306_text_document +0.00016312507834781404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0001687262224636195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00017236097186979052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0309_text_document +0.0002586993024691808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00026219934972577114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0311_text_document +0.0002566784476550503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0312_text_document +0.0002530671575343629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00025526495987018773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0314_text_document +0.0002510505062545801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00024743741398453804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00024882602559273036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00024230881628338428 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00025005854915078414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0319_text_document +0.00024477471955617643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0320_text_document +0.0002480463985551468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0321_text_document +0.00024335328103980772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00024464696562773777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0323_text_document +0.00023820565587951385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0324_text_document +0.00024537554558786237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0325_text_document +0.00024052017934692743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00023660347377746528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00023823292504990384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00023564543049854766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002370415962271789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00023453319757168757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0331_text_document +0.000236480621339876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002391149628895737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0333_text_document 
+0.00023165934662137285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00023331169915961683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0335_text_document +0.0002348226454144718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0336_text_document +0.00023564045570745751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0337_text_document +0.00016411316830860297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002007359738791159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0339_text_document +0.00019930606930833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0340_text_document +0.00019598670739211644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00019115600211637036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0342_text_document +0.00018957338451495675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0343_text_document +0.0001997256344570198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0344_text_document +0.0001924339501051294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0345_text_document +0.0001929492409258573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00019129356692417672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0347_text_document +0.0001927097658307402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0348_text_document +0.00018744016832935095 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00018898826127054628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00019337725386559253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0351_text_document +0.00018434878571055096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0352_text_document +0.00018454731188528818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0353_text_document +0.00018197801455061398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00018615322144032256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0355_text_document +0.00017981075274777777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00018028813451030057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0357_text_document +0.0001760055343765487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0358_text_document +0.00018306121836089844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00018110213343756692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00017839531596627688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0361_text_document +0.00017668405792307465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00018382867977972885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0363_text_document +0.00017812146256462094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0364_text_document 
+0.00017866992260811773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00017457542446637375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00017144357690622488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0367_text_document +0.00017669299438239817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0368_text_document +0.00017721730286035934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0002573630336497748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00025158500395961657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00025871208953576674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0372_text_document +0.0002522219361597465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0373_text_document +0.00025035546177162626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0374_text_document +0.00024714234522261514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0375_text_document +0.00024296206951019436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0376_text_document +0.00023797488747091152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0377_text_document +0.0002417964809184933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0378_text_document +0.0002317015633644362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00023529081059722227 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0380_text_document +0.00022865050303533797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0381_text_document +0.00022350627510674308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0382_text_document +0.00022416076407195612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0383_text_document +0.0002237152481700081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0384_text_document +0.00022673308251184112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0385_text_document +0.00021988509315558021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0386_text_document +0.00021791186375379613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0387_text_document +0.00021902394687174658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0388_text_document +0.00022390913224296354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0389_text_document +0.0002159569838456253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0390_text_document +0.000193074631476835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0391_text_document +0.00019772710141722856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0392_text_document +0.0001918863050023569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0393_text_document +0.0001968641761834432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0394_text_document +0.00019269495646727515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0395_text_document 
+0.0001986463032193898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0396_text_document +0.0001855871337995234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0397_text_document +0.00019041152711008963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0398_text_document +0.00018277849340888642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0399_text_document +0.00018810546599505484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0400_text_document +0.00018711834399232793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0401_text_document +0.000180125082690484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0402_text_document +0.00023744084906469025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0403_text_document +0.00023803845013258319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0404_text_document +0.00023586547263857976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0405_text_document +0.00023222402329423718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0406_text_document +0.00023270999204422837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0407_text_document +0.00023378783679246331 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0408_text_document +0.00017304047941651873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0409_text_document +0.00017585076104150352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0410_text_document +0.00017101296884180275 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0411_text_document +0.00017561096140154923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0412_text_document +0.0001713420333669203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0413_text_document +0.00022603582939637927 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0414_text_document +0.0001703733924033566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0415_text_document +0.0002396801442728503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0416_text_document +0.0001676400523382032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0417_text_document +0.00017193438273170229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0418_text_document +0.00017529021040710947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0419_text_document +0.0001630741415909194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0420_text_document +0.00024179471702347313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0421_text_document +0.00016581358754145113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0422_text_document +0.0002456894490564403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0423_text_document +0.0002456073517995372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0424_text_document +0.00024937580109172706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0425_text_document +0.0002457208726475487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0426_text_document 
+0.00024399607429757567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0427_text_document +0.00023977003702270238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0428_text_document +0.0002453131498067917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0429_text_document +0.0001621090466807557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0430_text_document +0.00024557101413066944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0431_text_document +0.00024662307150866836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0432_text_document +0.00015758980646827074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0433_text_document +0.00024391288666874046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0434_text_document +0.00023509503922816786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0435_text_document +0.00023489539281843744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0436_text_document +0.00023286637378893443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0437_text_document +0.00023379369093964089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0438_text_document +0.00023205784424428202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0439_text_document +0.00023009948269807432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0440_text_document +0.00023187584394201576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0441_text_document +0.00023202252759594008 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0442_text_document +0.00022728777233539934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0443_text_document +0.00022582666382743133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0444_text_document +0.00022616733175598707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0445_text_document +0.00022768677294110565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0446_text_document +0.00022367789565066836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0447_text_document +0.00022752055218158585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0448_text_document +0.00021819243338256605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0449_text_document +0.0002241455531613807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0450_text_document +0.00022437797440403226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0451_text_document +0.00022445007197791702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0452_text_document +0.00022150502971124016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0453_text_document +0.0002225145672731263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0454_text_document +0.00022368982014371355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0455_text_document +0.00022402755606263736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0456_text_document +0.00023016090138940315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0457_text_document 
+0.0002260342841680707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0458_text_document +0.00022458279279977673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0459_text_document +0.00021839974448010203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0460_text_document +0.0002264409368746725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0461_text_document +0.000223550215762877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0462_text_document +0.00021610601829010048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0463_text_document +0.00022408120517524368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0464_text_document +0.00021671066876802013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0465_text_document +0.00016072298972169641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0466_text_document +0.0001722371396276357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0467_text_document +0.00017303760343097654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0468_text_document +0.00016820735177759604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0469_text_document +0.0001782239553050235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0470_text_document +0.0001749477598265696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0471_text_document +0.0001700037698924768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0472_text_document +0.0001721297434219665 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0473_text_document +0.00017082606704868714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0474_text_document +0.00017400024710211123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0475_text_document +0.00017016210162102983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0476_text_document +0.00016745166973214216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0477_text_document +0.0001684428163376526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0478_text_document +0.0001648685852885396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0479_text_document +0.00017387645508870812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0480_text_document +0.00016594906144137858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0481_text_document +0.00016042654972698604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0482_text_document +0.00014860104507835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0483_text_document +0.00016227281398002708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0484_text_document +0.00016502091577582913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0485_text_document +0.00016106235650927743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0486_text_document +0.00015987309712264371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0487_text_document +0.0001642815421701454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0488_text_document 
+0.00016531915249024665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0489_text_document +0.00015833872193897052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0490_text_document +0.00015639158495488916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0491_text_document +0.00015342548972376501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0492_text_document +0.00015518921543764528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0493_text_document +0.0001621958240469728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0494_text_document +0.00015155749799598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0495_text_document +0.00014939896262383117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0496_text_document +0.00015490092509698006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0497_text_document +0.00017977881778259884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0498_text_document +0.00018596378104021417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0499_text_document +0.00017898738743182946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0500_text_document +0.00018286541046512472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0501_text_document +0.00018092409134830376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0502_text_document +0.00017788220095337013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0503_text_document +0.00017903547090898037 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0504_text_document +0.0001797342122414524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0505_text_document +0.00018405110997743763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0506_text_document +0.00016587458814992502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0507_text_document +0.00018323507493237133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0508_text_document +0.00017881236669457928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0509_text_document +0.00017083385044833047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0510_text_document +0.0001730201559992492 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0511_text_document +0.00016901593018907565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0512_text_document +0.00017121838351155997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0513_text_document +0.0001762157419442059 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0514_text_document +0.00017000047903250774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0515_text_document +0.00017628842147757824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0516_text_document +0.0001760014416563697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0517_text_document +0.00017080626611158523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0518_text_document +0.00017077485831581488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0519_text_document 
+0.0001740210774510124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0520_text_document +0.00017310752988628116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0521_text_document +0.00016563538206915967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0522_text_document +0.0001698038028867437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0523_text_document +0.00022989652913943246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0524_text_document +0.00023802118237282655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0525_text_document +0.00023209291976691602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0526_text_document +0.00023478978296678473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0527_text_document +0.00023185674392304132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0528_text_document +0.0002223151271899996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0529_text_document +0.0002212980337800594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0530_text_document +0.0002177142043482363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0531_text_document +0.00022071160791386127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0532_text_document +0.0002155092901614389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0533_text_document +0.00021709946336410436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0534_text_document +0.0002091085371649664 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0535_text_document +0.00021301299764538067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0536_text_document +0.00020514046046681228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0537_text_document +0.00020554350961511138 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0538_text_document +0.0002032929572669402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0539_text_document +0.00020017696773262392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0540_text_document +0.0002041760983122544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0541_text_document +0.00019610775249750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0542_text_document +0.0001972797535028649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0543_text_document +0.00019987201182946655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0544_text_document +0.00023221090921479249 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0545_text_document +0.00022866265656078542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0546_text_document +0.00022846213721182363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0547_text_document +0.00022028779604045222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0548_text_document +0.00023019534411130514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0549_text_document +0.00021499063838892918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0550_text_document 
+0.0002238747556640398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0551_text_document +0.000219139079337847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0552_text_document +0.00022466810662919942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0553_text_document +0.00021354111452743537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0554_text_document +0.0002116352569318229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0555_text_document +0.00021742490236552721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0556_text_document +0.00020976053145397075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0557_text_document +0.0002121893598598504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0558_text_document +0.00020611700008662688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0559_text_document +0.00020771394257887023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0560_text_document +0.00020861778045311834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0561_text_document +0.00020549717473124685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0562_text_document +0.00021168253336591858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0563_text_document +0.00020292362079976103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0564_text_document +0.0002053579978117472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0565_text_document +0.0002025742316233632 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0566_text_document +0.00019721191770863706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0567_text_document +0.00020263891920926902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0568_text_document +0.0002047513235561355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0569_text_document +0.0002058192920224309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0570_text_document +0.00020762611235464895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0571_text_document +0.00020536767369033477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0572_text_document +0.000208726602681654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0573_text_document +0.00020670689006790867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0574_text_document +0.0001987029852837105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0575_text_document +0.00019743671572624558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0576_text_document +0.00020347237873346202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0577_text_document +0.00019483561225711876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0578_text_document +0.00019876706376189147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0579_text_document +0.00019418407035646924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0580_text_document +0.00019094739234588127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0581_text_document 
+0.00018896169178427298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0582_text_document +0.00019336957140803166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0583_text_document +0.00019246034436187084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0584_text_document +0.00019234601030075014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0585_text_document +0.00018937638801999214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0586_text_document +0.00019243149393005724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0587_text_document +0.00018564518487541217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0588_text_document +0.00018349694905090308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0589_text_document +0.00018632405912780405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0590_text_document +0.0001859374743982387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0591_text_document +0.00018735943662878573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0592_text_document +0.00018429223346416512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0593_text_document +0.00018743951405683122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0594_text_document +0.0002231790070545305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0595_text_document +0.00023691491440731282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0596_text_document +0.00022732583835977663 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0597_text_document +0.00023280690754947414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0598_text_document +0.00023098339919576762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0599_text_document +0.00022742109041848038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0600_text_document +0.00023387941495424947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0601_text_document +0.00022226509841824269 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0602_text_document +0.00022342786655488707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0603_text_document +0.00022237713376406775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0604_text_document +0.00021379459835981835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0605_text_document +0.00021934823034546768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0606_text_document +0.00022299117012803982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0607_text_document +0.0002249652818475372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0608_text_document +0.00021549803647665793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0609_text_document +0.00021082391557018925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0610_text_document +0.0002063290532408184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0611_text_document +0.0002098859538424268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0612_text_document 
+0.00020927123951292785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0613_text_document +0.00020988114416198002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0614_text_document +0.00020708947339409333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0615_text_document +0.00020681735599881374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0616_text_document +0.00020862989695824213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0617_text_document +0.00017921617624032585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0618_text_document +0.0001869630178204498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0619_text_document +0.0001837421970952879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0620_text_document +0.0002703540624747488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0621_text_document +0.0002773524903329593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0622_text_document +0.00026751943505093036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0623_text_document +0.00026849089128670544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0624_text_document +0.00017768273890485142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0625_text_document +0.00026694167218726514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0626_text_document +0.00026851367038544275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0627_text_document +0.00017178448275206052 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0628_text_document +0.00026146356857229295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0629_text_document +0.0002631494175929668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0630_text_document +0.00026756049947472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0631_text_document +0.0002600735435281443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0632_text_document +0.00026162102069795645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0633_text_document +0.0002546230805208093 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0634_text_document +0.00025384118907342997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0635_text_document +0.00024898898905737453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0636_text_document +0.0002560021645785107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0637_text_document +0.00025001876340897294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0638_text_document +0.00024817567624010623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0639_text_document +0.00025419118513633326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0640_text_document +0.00025520008446783997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0641_text_document +0.00024355226527934937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0642_text_document +0.00024233948860872504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0643_text_document 
+0.00024413553528635867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0644_text_document +0.00024287456234999737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0645_text_document +0.0002471744870080021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0646_text_document +0.00024318841473052868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0647_text_document +0.00024268080340573577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0648_text_document +0.000242363177173413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0649_text_document +0.00025310826613573865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0650_text_document +0.0002450433802404371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0651_text_document +0.0002429196089265994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0652_text_document +0.00023818874203405117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0653_text_document +0.00023814010078402416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0654_text_document +0.0002258262625271231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0655_text_document +0.0002359106231188901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0656_text_document +0.00023984369117779496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0657_text_document +0.00022677878582898447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0658_text_document +0.00023019334994987196 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0659_text_document +0.0002326106169086802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0660_text_document +0.00023296218608853588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0661_text_document +0.00021930251468821644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0662_text_document +0.00022685746290158792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0663_text_document +0.00022204375118840136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0664_text_document +0.00022312982876300855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0665_text_document +0.00022347955655196657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0666_text_document +0.00021968416238742178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0667_text_document +0.00022148339454050315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0668_text_document +0.00022133417129237745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0669_text_document +0.00021840518548046784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0670_text_document +0.00021501258675160414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0671_text_document +0.00016302293581967305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0672_text_document +0.00018778970953587786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0673_text_document +0.00018910837173616491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0674_text_document 
+0.00019201324078164315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0675_text_document +0.00018432054093123207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0676_text_document +0.0001907749590824511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0677_text_document +0.00017971831966331778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0678_text_document +0.00018425384289495064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0679_text_document +0.00018353447605936826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0680_text_document +0.00018776194922919426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0681_text_document +0.000181858547251418 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0682_text_document +0.00017663862855632625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0683_text_document +0.00017879513620194847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0684_text_document +0.00017779569087388971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0685_text_document +0.00017180299093946108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0686_text_document +0.00018535528192944938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0687_text_document +0.0001710147287879884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0688_text_document +0.000173577199328182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0689_text_document +0.0001768576763304655 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0690_text_document +0.0001796376911260544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0691_text_document +0.00016782824293218567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0692_text_document +0.00016074324428116396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0693_text_document +0.0001687513348299545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0694_text_document +0.00016077518171436444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0695_text_document +0.00017132313128327624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0696_text_document +0.00016190186959679132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0697_text_document +0.00016290842504820753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0698_text_document +0.00016156811558387776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0699_text_document +0.00023555665280084346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0700_text_document +0.0002284718177796522 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0701_text_document +0.00022374123273516798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0702_text_document +0.00021994541999416394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0703_text_document +0.00022338573100973358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0704_text_document +0.00022049767881647008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0705_text_document 
+0.00022416439897413284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0706_text_document +0.00021895546198784436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0707_text_document +0.0002142388294097341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0708_text_document +0.00022017166748084383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0709_text_document +0.00021104350754308596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0710_text_document +0.00020985391201191366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0711_text_document +0.00021778183924550787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0712_text_document +0.00021271266854227129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0713_text_document +0.0002086433619903549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0714_text_document +0.00021568150697937684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0715_text_document +0.00020764802098217656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0716_text_document +0.0002151205404833473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0717_text_document +0.00020430590834946775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0718_text_document +0.00020866543326050432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0719_text_document +0.00020818319961436583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0720_text_document +0.00020070798626764516 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0721_text_document +0.00019693995826673832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0722_text_document +0.00020030234076064843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0723_text_document +0.00019788654054706263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0724_text_document +0.0001993257554824347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0725_text_document +0.00021386012383904914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0726_text_document +0.00021978412787373083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0727_text_document +0.0002175599344895926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0728_text_document +0.00021091594587352813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0729_text_document +0.0002034137316303627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0730_text_document +0.00021253423082914959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0731_text_document +0.00020816999471172712 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0732_text_document +0.00021853522405647908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0733_text_document +0.0002051944662085363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0734_text_document +0.00020978726975291983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0735_text_document +0.00020468921406556763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0736_text_document 
+0.0002007846124143192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0737_text_document +0.00020366090300396152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0738_text_document +0.0001993156168498017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0739_text_document +0.00020150340666889603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0740_text_document +0.00020188286325854645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0741_text_document +0.00020072267667247027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0742_text_document +0.00019591912629771525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0743_text_document +0.00020056463740447396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0744_text_document +0.0001962511050627094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0745_text_document +0.00018969020412060633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0746_text_document +0.00018711981666080213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0747_text_document +0.00019064086480658448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0748_text_document +0.0001893430509717561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0749_text_document +0.00018823938035214858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0750_text_document +0.000191049243153872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0751_text_document +0.00015215085209234548 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0752_text_document +0.00013881666461144156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0753_text_document +0.0001511979467407442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0754_text_document +0.00015091819106548992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0755_text_document +0.00013896830454629422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0756_text_document +0.00014286084497610213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0757_text_document +0.00013829013170563417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0758_text_document +0.00014842506748913496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0759_text_document +0.0001621698681108632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0760_text_document +0.00025658329333000087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0761_text_document +0.0002625776226522738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0762_text_document +0.00018893904126945972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0763_text_document +0.00019173419836462428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0764_text_document +0.00024972708669590365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0765_text_document +0.0002565621075859928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0766_text_document +0.0002548091984702725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0767_text_document 
+0.00024781120449025493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0768_text_document +0.00024190263274768403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0769_text_document +0.00024935480538538027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0770_text_document +0.00024565807926820224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0771_text_document +0.00024335665926774057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0772_text_document +0.0002407471035651234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0773_text_document +0.00024409063432302957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0774_text_document +0.00025048184051844287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0775_text_document +2.7431736503196682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0776_text_document +0.0001542652540558753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0000_text_document +0.0001414689533672357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00014218991553196462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0002_text_document +0.00014380616486339045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00014537826992690233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00015240156803853129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0005_text_document +0.0001508299161037807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0006_text_document 
+0.0001645724380011881 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0007_text_document +0.0001636434127327491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0008_text_document +0.0001425695379726649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0009_text_document +0.00015038309042278246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0010_text_document +0.00015551331010771582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0011_text_document +0.00014395190746068794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0012_text_document +0.00014572155617954775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00014985257363654754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00016517178815597176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00015368391453534256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0016_text_document +0.00013802907993189142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0001438832947332681 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0018_text_document +0.0001453654604013201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00015126685069470999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00014666492015973732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0021_text_document +0.00015372684675786069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0022_text_document 
+0.0001466694423156705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00014645983052842166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00014464707855314855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0025_text_document +0.00014224079429035223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00015150561574001976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0027_text_document +0.00014869251464718684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00014975351070572874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00014693519813853656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0030_text_document +0.00015177096878176463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0031_text_document +0.0001541385774188545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0032_text_document +0.00014686140972307025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0033_text_document +0.00014836061485888312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00015908940031748178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00014335960523511807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00014014336145596836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00014804788542816872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0038_text_document 
+0.00014447262570766296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0039_text_document +0.0001490836674378867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0040_text_document +0.00015491171627451768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00014704465686983656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0042_text_document +0.00015578029994136968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00014442509556094932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0044_text_document +0.00016401352835433973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0045_text_document +0.0001426617272165932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00014952006301290383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00014858509055287383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0048_text_document +0.0001452147802800582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00014648995026373163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0050_text_document +0.000150292569067835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00015359505638013499 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0052_text_document +0.00014342220561517732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0053_text_document +0.00015037020981817882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0054_text_document 
+0.0001442503228598675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00015512168691210362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0056_text_document +0.000141978855262853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0001433238477981227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0058_text_document +0.0001522852753554881 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00015750021259583146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0060_text_document +0.0001620583984355833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0061_text_document +0.00014425968431250636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00015502607180742606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0063_text_document +0.00014808719854384823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00014037741406088144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0065_text_document +0.00014415351915599912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00014669998038063754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00014168851942590583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00016615444649487683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00017314227247280456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0070_text_document 
+0.00014511886160872687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0071_text_document +0.0001589885117911034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0072_text_document +0.0001468857466370262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00014409172483178647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00017524066610798787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0001423201779575328 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00014813204150867026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0001426495065609589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00015198519700337085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0079_text_document +0.0001407239353962083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0080_text_document +0.00015564799275992607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00014044706039573722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00014271692599994692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0083_text_document +0.000145622079855115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0001420329587382314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00014388823447845187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0086_text_document +0.0001386395317413269 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00014615498445222442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00014100731560794867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0089_text_document +0.0001412468938663676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0001448361986040564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00015041376595655126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00015393889374199827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0093_text_document +0.0001424230223910099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00013832238850082653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00014573052620396468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00014871061906625763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0097_text_document +0.0001474653563212365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00014332440162216428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0099_text_document +0.00013995360169386805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0100_text_document +0.0001396957447740551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00014451429874557317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00014667057760559536 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00014311302174425863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0104_text_document +0.0001486303888676766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00014984904337848564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00014471364010783683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00014422564733335141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00014833706425660122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001547519654335586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0110_text_document +0.00016861028196725518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0111_text_document +0.00014655906054430117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00016230445673145143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0113_text_document +0.0001608744287595928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00014838797263124772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00013772432541929463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00014338391080519997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00013969596121954725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0118_text_document +0.00014433977111903384 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00013940910504621967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00013841015875212353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0121_text_document +0.00015084460181936482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00015609034169658813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00014131566380676185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0124_text_document +0.0001489310284479002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0125_text_document +0.000142611271970708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00013893968956373896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00014729183194546773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0128_text_document +0.00013844600256987405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00014038359448051134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00014148398954188355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00014453817241187933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0132_text_document +0.00014513564218102443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00013715869534969562 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0134_text_document +0.00013954027841855143 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0135_text_document +0.0001412204761634212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0136_text_document +0.00014007212448120704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00013479800901952131 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00014295404043242684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0139_text_document +0.00013573518591642275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00013489486257742554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0141_text_document +0.00014869208126259815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0142_text_document +0.0001475290332523071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00013460777613768496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00013367855194670696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00014765624643721848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0146_text_document +0.000134270744838921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0147_text_document +0.0001482262234332188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0148_text_document +0.00013864007544648005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00014100224826604942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0150_text_document +0.00013048509121512907 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0151_text_document +0.00012998584056022605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0152_text_document +0.00013999889926826433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0153_text_document +0.00015896524130927037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00013250422088217822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00013171712561130657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0156_text_document +0.0001353745598377907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0157_text_document +0.00014385989862913682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0158_text_document +0.0001337953809308385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00013398910556960017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0160_text_document +0.00013559702582181232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0161_text_document +0.0001479395819777683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00013591800338063272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0163_text_document +0.0001378006151746279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0164_text_document +0.0001348466006623147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0165_text_document +0.0001333909410523815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0166_text_document +0.0001395413623736275 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0001340932768114764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0168_text_document +0.0001401874805500622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0169_text_document +0.00013331732046238236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00013697107190707125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00014902856836260464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0172_text_document +0.0001360841851454116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0001430140955418351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00013769551793105646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0175_text_document +0.00013655894858384475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0176_text_document +0.00013428329448183135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00013659792851661152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0001390035871359384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00013373343260207954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0180_text_document +0.0001377384027675603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00013688101750180593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00013942483868376482 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0183_text_document +0.00013069676947684327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0184_text_document +0.00013248181223347942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00013404120081582244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0186_text_document +0.0001361765930560515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00012895481023244784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0001269948854413741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00012923062571125647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00013146444734116587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00012866221788337398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00012734919091675074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00012491017673902597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00012532940253474304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00012942822588429847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0196_text_document +0.00012954876208363892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00012757889363363662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00012925483823692497 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0199_text_document +0.00012887114306702046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00014305235456549959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00012924991650829868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00013939886744592149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00013473816912159447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00012443870588817695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00012352413384768962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00012363992848397884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00012876521187895858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0208_text_document +0.00012998676310669105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0209_text_document +0.00013573707197851088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00012914628304832383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00012135846145074816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0212_text_document +0.0001272491158502837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0213_text_document +0.00014048669089899133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00012821863542952837 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00012843614908145614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0216_text_document +0.00012566972592748682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0217_text_document +0.00012623965035462757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00012745682281848042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00012684031670531754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00013734922167929733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0221_text_document +0.00012364311692105248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00012150014908859676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0223_text_document +0.00013255947544281956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00013080450775030287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00012642072366799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00012748944981690917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0227_text_document +0.0001272640012288133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0228_text_document +0.00012676436334132504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00011463874381385243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0230_text_document +0.0001259763726722257 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00013265355691888996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00012800075083395775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00012600035320386608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00012796669337022373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0001281363666451258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00013103924202277517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00013710099201804686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00012390494315996567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00012375130141281296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00012654460329615904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00013347917998097572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00012957465780002206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00012365300899833007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00012759104863989702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0245_text_document +0.00012669826503428652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00019585621938937627 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0247_text_document +0.00017199211271798405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00017151003159557893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0249_text_document +0.0001704765251017538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00016915887010107177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0251_text_document +0.000164507074917777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0252_text_document +0.0001707345009802067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00016235282921392888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0254_text_document +0.0001603312806389334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00016063927887228715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0256_text_document +0.0001682293216120587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00016945118701893779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00016510575549830714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00015878514261762818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00016058925849180358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0261_text_document +0.00016806270202025228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0262_text_document +0.0001601743221175851 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0263_text_document +0.00016728028661189246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00016271828570438892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0265_text_document +0.0001663197659329172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00016211838369998094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00016174818095722866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00016660916885770873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0269_text_document +0.00016749279166083448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00015990162967327836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0271_text_document +0.00016050019425679443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00015826664805809287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00015906002765230277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0274_text_document +0.00016496336225309003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00015969348413764765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0276_text_document +0.00015888249989873604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001588217905168081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0278_text_document +0.0001579176192128451 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0279_text_document +0.0001599592014593771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0280_text_document +0.00015860202306757735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00015475539919197688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0282_text_document +0.0001606154789998261 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0283_text_document +0.00015967691482799697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00015467004809542842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0285_text_document +0.00015681467419158087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0001622263618651377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00016071879902106084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00015926245724996415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00015865169965265541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0001558589009989086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00015834413702510978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00015984235618630313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0293_text_document +0.00015906347325722462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0001540401129832678 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00015709268423517463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0296_text_document +0.00016150611616707217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0297_text_document +0.0001575761279522917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0298_text_document +0.00015145845456253164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00015531545597525365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0300_text_document +0.00015290580088858923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0301_text_document +0.00015077381822016696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00016026706987479596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00015143811781794564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00015335594803302406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00015760769888428818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00016811053178478525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00021456946285616728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00021300214303968855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00020349194545531642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0310_text_document +0.00021281325399560017 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00020973400589848146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00020126033912157333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00020674507357011296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0314_text_document +0.00021222543863325493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002050723383820817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0316_text_document +0.00021804813803312056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002008803314227051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002150047024098784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0319_text_document +0.00020318723314588857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0320_text_document +0.00020021945595806058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00020351797666608406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00020832621085218548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0323_text_document +0.0002035206051090622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00020272338181805027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00020460676190716195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00020717814792849565 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0001955827435950214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0328_text_document +0.00020417807396352577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0329_text_document +0.0002011029760914888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00020292765823625672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00020035339845060027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002019662525247444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0333_text_document +0.000206838061219021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0001941713531348939 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00010283055875342613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00010052128921034293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00021410992316202177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0338_text_document +0.0002019050315219438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0339_text_document +0.0001986035353671086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0340_text_document +0.00019334420113344198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0341_text_document +0.0002040134561840194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0342_text_document +0.00019786749210973914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00021292248961774976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0344_text_document +0.000198975254462317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00019270601369753864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0346_text_document +0.0001938662101557011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0347_text_document +0.0002029610545170106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002024962737322469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002075197885043544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00019871985248356538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0351_text_document +0.0001949694696029141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0352_text_document +0.00020180408203543252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0353_text_document +0.00019545199817763088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0354_text_document +0.00019734611243298183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0355_text_document +0.00021047242956266074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0356_text_document +0.0001968562822164333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0357_text_document +0.00019972266049102106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00020126122390730825 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0359_text_document +0.00019799642896091898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002021712802087185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0001941903201275054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0362_text_document +0.00019307283352311706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00016598318480128866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0364_text_document +0.00016504803365649659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00016630327313193533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00016601923469884076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0367_text_document +0.0001681694501234557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00016859564709291555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00016845101707974437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001643037792913447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00016186624765418046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00016697344045101027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00016669715111205908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00016364850623567704 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0375_text_document +0.0001634811496926281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00016825687707295152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0377_text_document +0.0001627585946667742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00016582351614544805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0379_text_document +0.0001630893218980273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0001568416151151013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0381_text_document +0.00017273044852059518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0382_text_document +0.00016016530273273665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00015777742226002822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00016385370668116144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0385_text_document +0.00016954547679602915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0001676626705219338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00016250610371947111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00016004510983519738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0389_text_document +0.000161815220320894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0390_text_document +0.00016744693680716642 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0391_text_document +0.00015604191096880147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0001636895622681933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0393_text_document +0.000158886517344257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0001558972054341701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0001591533045533395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0001657955386528658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00016060726764524156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00016167923208527019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00015484394662326808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00016052047349647775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0401_text_document +0.0001582576585363055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0402_text_document +0.0001545777833300399 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00016303475566860345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0404_text_document +0.0001627904173369714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0405_text_document +0.0001567550665344843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001587287727580368 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0407_text_document +0.0001606889088117574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0408_text_document +0.00016206324217472778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00015712668987045555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0410_text_document +0.0001607143430081059 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00015230600229428526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00016067822548676263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00015993580979768466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00016379843410396262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0415_text_document +0.0001533135627240871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00016861285265852845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0417_text_document +0.0001632799417656467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00015962871905586431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00015014915949133304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0420_text_document +0.00015059096546324844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0421_text_document +0.00015841934874861877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0422_text_document +0.000152377097357806 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0423_text_document +0.00014942797865989248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0424_text_document +0.00015640838403734855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0001557305888039896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0426_text_document +0.00014992907934376868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00015847297170019638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0428_text_document +0.0001563057066889321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00015425884830587555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00015294599138593887 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00015307387809393826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00016021533866175615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0433_text_document +0.00015819924688246454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0434_text_document +0.00014854336050366525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00015428039783626384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00015380539006369472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0437_text_document +0.00015543551510602353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00015792640857808265 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00015591945366146652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0440_text_document +0.00014809559672766608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00015190843215388426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0442_text_document +0.00014890757113683386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0443_text_document +0.0001610286090290533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0444_text_document +0.00015061787553649923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0445_text_document +0.00014811603935037767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0446_text_document +0.00015254163073097444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0447_text_document +0.00015300211863900935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0448_text_document +0.00015063192030688013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0449_text_document +0.00015300622789493292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0450_text_document +0.00015096280425750327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0451_text_document +0.00015205454100558358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0452_text_document +0.00015121161958027361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0453_text_document +0.0001493611157597698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0454_text_document +0.00015838957873196607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0455_text_document +0.0001497669779590609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0456_text_document +0.00015173657097785533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0457_text_document +0.0001542516903028995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0458_text_document +0.000149139532833868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0459_text_document +0.00014644441551246194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0460_text_document +0.00015166787754612994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0461_text_document +0.00014923555170687534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0462_text_document +0.00015589324574035403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0463_text_document +0.00015022803227804745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0464_text_document +0.00015127324533861265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0465_text_document +0.00014783676790095657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0466_text_document +0.00014927753645591052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0467_text_document +0.00014753911610765252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0468_text_document +0.00014886425094132403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0469_text_document +0.00014432622711023067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0470_text_document +0.00015087353567030766 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0471_text_document +0.00015318739523991737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0472_text_document +0.00014716603935377532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0473_text_document +0.00015032310787320853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0474_text_document +0.00014425315738264723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0475_text_document +0.0001507311940067415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0476_text_document +0.0001735562949386336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0477_text_document +0.0001664225151007615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0478_text_document +0.00017016223341338198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0479_text_document +0.0001686337558140661 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0480_text_document +0.00018737654520115072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0481_text_document +0.00016696818282464752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0482_text_document +0.00017542891864931188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0483_text_document +0.000168925038877694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0484_text_document +0.0001769097096293462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0485_text_document +0.00017465563985682533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0486_text_document +0.0001704723163845607 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0487_text_document +0.00017113194080906855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0488_text_document +0.00017056770492485763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0489_text_document +0.0001736825492971628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0490_text_document +0.00017060994856935613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0491_text_document +0.00017539355807018588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0492_text_document +0.00017512560274649157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0493_text_document +0.00017536288179601056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0494_text_document +0.00017214679473623093 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0495_text_document +0.00017372473469635212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0496_text_document +0.00016968876198424372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0497_text_document +0.00017328658337078598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0498_text_document +0.00016545006523949998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0499_text_document +0.0001712623636560391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0500_text_document +0.00017259544872761246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0501_text_document +0.00016731532955664165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0502_text_document +0.00017234554920296389 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0503_text_document +0.00016824263782247044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0504_text_document +0.00017046154865322805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0505_text_document +0.00016701775451880861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0506_text_document +0.0001640723558698162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0507_text_document +0.00016912021224512063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0508_text_document +0.00016148128416798815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0509_text_document +0.00017033021559990035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0510_text_document +0.00016742449903581303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0511_text_document +0.00016604941440707502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0512_text_document +0.00016168218680070063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0513_text_document +0.00016545734985198287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0514_text_document +0.00016617264790719555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0515_text_document +0.00016903898126379064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0516_text_document +0.00016251470403425602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0517_text_document +0.00016741321573477316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0518_text_document +0.00016314387702135404 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0519_text_document +0.00016261766224352778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0520_text_document +0.00016043765927930694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0521_text_document +0.0001581188444159775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0522_text_document +0.0001675593630876091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0523_text_document +0.00016225811098829194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0524_text_document +0.00016027854790273813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0525_text_document +0.00015477514040295668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0526_text_document +0.00016132027735084922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0527_text_document +0.00016144543812901825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0528_text_document +0.00016356924967160763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0529_text_document +0.00016721507926064277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0530_text_document +0.0001623283758093546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0531_text_document +0.00016540060361910116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0532_text_document +0.00016618517731232895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0533_text_document +0.0001661140965633334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0534_text_document +0.00016521134906101744 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0535_text_document +0.0001605250452596446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0536_text_document +0.00016158626615495202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0537_text_document +0.00016348402666537893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0538_text_document +0.00015887094758334445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0539_text_document +0.00016216761850919694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0540_text_document +0.00016125922688833952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0541_text_document +0.00015719662175540762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0542_text_document +0.00016177908132776304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0543_text_document +0.0001616654955707841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0544_text_document +0.0001575744247706023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0545_text_document +0.00016594502227726776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0546_text_document +0.00016680360478028852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0547_text_document +0.00016969508752354227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0548_text_document +0.00018702211879271686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0549_text_document +0.00019358085009705273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0550_text_document +0.0001871367897387826 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0551_text_document +0.00018452058370522755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0552_text_document +0.0001850164319455863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0553_text_document +0.00018589455402222413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0554_text_document +0.00018848818876445855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0555_text_document +0.00018677441309244695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0556_text_document +0.00018806266359047162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0557_text_document +0.00018742615490284408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0558_text_document +0.00018308658912909244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0559_text_document +0.00017917024956722993 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0560_text_document +0.0001796815083811096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0561_text_document +0.00018830762534435366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0562_text_document +0.0001850705756497164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0563_text_document +0.00018620607609678367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0564_text_document +0.00018735293561315315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0565_text_document +0.00018406055855123805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0566_text_document +0.00018296049025592247 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0567_text_document +0.00018407127494772196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0568_text_document +0.0001809459590066732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0569_text_document +0.00018206921683271417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0570_text_document +0.0001823423927624476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0571_text_document +0.00017843504198196598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0572_text_document +0.0001849074668186014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0573_text_document +0.0001812163144813499 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0574_text_document +0.00018309068999374263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0575_text_document +0.00018500613289155086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0576_text_document +0.00017930403632760822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0577_text_document +0.0001846380543749688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0578_text_document +0.0001805411790348431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0579_text_document +0.00017815258406988848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0580_text_document +0.00017771149209661494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0581_text_document +0.000179212119800064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0582_text_document +0.0001770710081666354 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0583_text_document +0.00018076802304233783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0584_text_document +0.00018266780486243524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0585_text_document +0.00017952537023013302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0586_text_document +0.00017482592939671484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0587_text_document +0.00017479307237867526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0588_text_document +0.00017947982239834899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0589_text_document +0.00017800230944457152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0590_text_document +0.0001768045667273756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0591_text_document +0.00018432659029891628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0592_text_document +0.00017860310980883306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0593_text_document +0.00017352563618741148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0594_text_document +0.000177967402241009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0595_text_document +0.0001761394507080597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0596_text_document +0.0001727461411889822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0597_text_document +0.00017520765607261058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0598_text_document +0.00017389963918978602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0599_text_document +0.00017297383567671195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0600_text_document +0.00017186248654837811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0601_text_document +0.00018016764298215066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0602_text_document +0.00017252933018279703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0603_text_document +0.0001720498259217191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0604_text_document +0.00017208910794032673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0605_text_document +0.0001638288329725128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0606_text_document +0.00015774370365565657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0607_text_document +0.00015428183891406193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0608_text_document +0.0001579263490987627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0609_text_document +0.00015679781661701012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0610_text_document +0.00015686067490532405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0611_text_document +0.00015476043642401294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0612_text_document +0.0001538144005636655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0613_text_document +0.00015471809257783847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0614_text_document +0.00014950254548936378 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0615_text_document +0.00015189343275275787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0616_text_document +0.00016808135779534307 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0617_text_document +0.00015331380459020154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0618_text_document +0.00015025506525877266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0619_text_document +0.00015705079524537657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0620_text_document +0.00014843144411648014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0621_text_document +0.0001536670204340525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0622_text_document +0.00014701650982417206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0623_text_document +0.0001470830903826265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0624_text_document +0.00014669457615379322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0625_text_document +0.00015327731341039172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0626_text_document +0.00016421071093813112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0627_text_document +0.00014320086554259857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0628_text_document +0.00014733292080267092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0629_text_document +0.00014574339323444963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0630_text_document +0.00014508510524362508 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0631_text_document +0.0001510667294376284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0632_text_document +0.00014448955337404646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0633_text_document +0.00015189242851477872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0634_text_document +0.0001408976680729981 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0635_text_document +0.00014495438771487836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0636_text_document +0.00014607129482780071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0637_text_document +0.0001425703250247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0638_text_document +0.00014772556798043487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0639_text_document +0.0001454755294743558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0640_text_document +0.00014604759342940054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0641_text_document +0.000144987966876031 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0642_text_document +0.00014159362399631978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0643_text_document +0.00015166107543186514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0644_text_document +0.00013872638536941069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0645_text_document +0.00014392691133816916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0646_text_document +0.00014527538230304764 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0647_text_document +0.0001445241296159157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0648_text_document +0.00014566980102669863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0649_text_document +0.00014105957349679274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0650_text_document +0.00014407711883329926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0651_text_document +0.00014304333666146412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0652_text_document +0.00014480474786471068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0653_text_document +0.00014513562095603888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0654_text_document +0.00014216954843071324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0655_text_document +0.0001472056417215835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0656_text_document +0.0001411732545194045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0657_text_document +0.00014472737242668624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0658_text_document +0.0001412212585262607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0659_text_document +0.00020834639482623596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0660_text_document +0.00019484913874296875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0661_text_document +0.00019400182473285833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0662_text_document +0.000192581173021768 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0663_text_document +0.0001958163408499538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0664_text_document +0.00019017201894348343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0665_text_document +0.00018748712836308062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0666_text_document +0.00019398325978096153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0667_text_document +0.00018740362852951608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0668_text_document +0.00018769931256921782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0669_text_document +0.00018841740417805205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0670_text_document +0.0001897879160564146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0671_text_document +0.00018663113185306689 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0672_text_document +0.00018894652949372258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0673_text_document +0.0001929378648272062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0674_text_document +0.00019134942047365448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0675_text_document +0.00018699153383533985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0676_text_document +0.00018610331853766602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0677_text_document +0.0001863160274451902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0678_text_document +0.00018636405144302115 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0679_text_document +0.00018489348621678148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0680_text_document +0.0001860176372198307 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0681_text_document +0.00018315031813541827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0682_text_document +0.00019049993633217256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0683_text_document +0.00018374255446481207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0684_text_document +0.00017918235151102646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0685_text_document +0.00018078078222027994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0686_text_document +0.00018377134048126254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0687_text_document +0.00018119048712916442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0688_text_document +0.00018226290667237163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0689_text_document +0.00018539016766122422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0690_text_document +0.00018304864675259609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0691_text_document +0.00018006283819913595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0692_text_document +0.00017853375396011673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0693_text_document +0.0001806080666151815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0694_text_document +0.00018287085590792935 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0695_text_document +0.00018102703894508278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0696_text_document +0.00017985249563069855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0697_text_document +0.00018055111208127884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0698_text_document +0.00017436715651687287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0699_text_document +0.0001750410902836745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0700_text_document +0.0001755658852086883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0701_text_document +0.00017704710809249836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0702_text_document +0.00017563712144304312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0703_text_document +0.00017646118668991032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0704_text_document +0.0001738273848965312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0705_text_document +0.00017355052248297015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0706_text_document +0.00017182494917422235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0707_text_document +0.0001796801127149085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0708_text_document +0.0001535678074475219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0709_text_document +0.00016509131806569352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0710_text_document +0.0001660762988129014 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0711_text_document +0.00017181117317139103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0712_text_document +0.00016385189811495075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0713_text_document +0.00016321938466065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0714_text_document +0.0001627668114510062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0715_text_document +0.0001667874841569603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0716_text_document +0.0001647336272051215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0717_text_document +0.00015927038206724374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0718_text_document +0.000163069807004626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0719_text_document +0.00016643362662749963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0720_text_document +0.0001598347201275479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0721_text_document +0.00016414824852047793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0722_text_document +0.00016387374849716915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0723_text_document +0.00016218986007283508 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0724_text_document +0.00016170100645242406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0725_text_document +0.00016794279442600715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0726_text_document +0.00016410407241508566 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0727_text_document +0.00016663924614304762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0728_text_document +0.0001610334643678992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0729_text_document +0.00016082817926927476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0730_text_document +0.00016483710320531984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0731_text_document +0.00015950564573034403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0732_text_document +0.00016176598872010603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0733_text_document +0.00016374799045777884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0734_text_document +0.00016207070843359862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0735_text_document +0.000161310121195263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0736_text_document +0.0001590930806312555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0737_text_document +0.00015872700071854542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0738_text_document +0.0001601426608559989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0739_text_document +0.0001592737504230903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0740_text_document +0.0001599609389465664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0741_text_document +0.0001573951015313392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0742_text_document +0.00015918138446881715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0743_text_document +0.00016063409035052854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0744_text_document +0.00015479247307168076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0745_text_document +0.0001590206266750552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0746_text_document +0.00016413616409963463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0747_text_document +0.00015909403254717725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0748_text_document +0.00015912638065916792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0749_text_document +0.00015509170291798033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0750_text_document +0.00015668221053756931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0751_text_document +0.00015993661313870757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0752_text_document +0.00015986553041529475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0753_text_document +0.0001551253906720823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0754_text_document +0.0001569044427999477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0755_text_document +0.00015512319487328638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0756_text_document +0.00016021882869106635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0757_text_document +0.00015415106017838012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0758_text_document +0.00015711650631982987 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0759_text_document +0.00015512670736159294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0760_text_document +0.00016200410442893923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0761_text_document +0.00015949285619573655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0762_text_document +0.0001625616727060612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0763_text_document +0.00016316486655764686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0764_text_document +0.0001571167311565954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0765_text_document +0.00016128213234978153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0766_text_document +0.00015535324730882956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0767_text_document +0.0001579934311592013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0768_text_document +0.00015195311864613838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0769_text_document +0.0001615190125670139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0770_text_document +0.00015867133202388371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0771_text_document +0.00015932910049616658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0772_text_document +0.00015735730575532447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0773_text_document +0.00016192787415292593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0774_text_document +0.00015443514945271916 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0775_text_document +0.00015290872574095856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0776_text_document +0.0001586657525675075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0777_text_document +0.0001561292345081933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0778_text_document +0.0001584146414910674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0779_text_document +0.00015282231142071527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0780_text_document +0.0001561252202711004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0781_text_document +0.00015508367049496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0782_text_document +0.00015211947613405347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0783_text_document +0.00014976529550875275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0784_text_document +0.00015418186133444713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0785_text_document +0.00015777360151582686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0786_text_document +0.000152640262498424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0787_text_document +0.00015418142572863903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0788_text_document +0.00015502601134089746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0789_text_document +0.00015405733434421877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0790_text_document +0.00015484459497253604 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0791_text_document +0.0001541867208689297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0792_text_document +0.00015014404352940876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0793_text_document +0.00015357544967633106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0794_text_document +0.00015037823631794736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0795_text_document +0.00015025795679285078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0796_text_document +0.00014876992710553488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0797_text_document +0.00015032669711698612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0798_text_document +0.00015596697517010466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0799_text_document +0.00015498394440674378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0800_text_document +0.00014757314272111684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0801_text_document +0.00014919071614611802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0802_text_document +0.00014686280514246915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0803_text_document +0.00015882771228777683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0804_text_document +0.00014763597756322578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0805_text_document +0.00014785441795725526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0806_text_document +0.00015313024795352964 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0807_text_document +0.0001497627986113246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0808_text_document +0.00014499607432690722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0809_text_document +0.0001461719027401259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0810_text_document +0.00014839933441537366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0811_text_document +0.0001475840995029022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0812_text_document +0.00015065512711375653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0813_text_document +0.00015285087358760883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0814_text_document +0.00014861957547794477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0815_text_document +0.00014996949492468605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0816_text_document +0.0001472998668365096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0817_text_document +0.0001464012147691964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0818_text_document +0.00015227635617231567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0819_text_document +0.0001491494017117428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0820_text_document +0.00014464475787246092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0821_text_document +0.00014410767861685618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0822_text_document +0.000144919516791233 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0823_text_document +0.00014507990635617585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0824_text_document +0.0001468797342896656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0825_text_document +0.0001422000420712919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0826_text_document +0.00014228987139298954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0827_text_document +0.00014481016912090385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0828_text_document +0.000142802473797815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0829_text_document +0.00014812295450003065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0830_text_document +0.00014697991622146685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0831_text_document +0.000143946325289488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0832_text_document +0.0001418544716646782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0833_text_document +0.00014706985092768576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0834_text_document +0.0001411487598988699 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0835_text_document +0.0001583983550166893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0836_text_document +0.00015370277071378533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0837_text_document +0.0001574284524004961 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0838_text_document +0.00016033599900258183 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0839_text_document +0.00016159470012508268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0840_text_document +0.00015624921021983388 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0841_text_document +0.0001603288323615303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0842_text_document +0.00016421653645625842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0843_text_document +0.00016136751182857813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0844_text_document +0.0001644008542307843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0845_text_document +0.00016320230298972016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0846_text_document +0.00016176830866038722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0847_text_document +0.00015883945834286212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0848_text_document +0.00015854734059433728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0849_text_document +0.00015424048326372636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0850_text_document +0.00015913631543321879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0851_text_document +0.00016242367155204024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0852_text_document +0.00016352898883564303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0853_text_document +0.00016283852574114027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0854_text_document +0.0001597064012689706 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0855_text_document +0.00015723207463854053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0856_text_document +0.00016082454091186785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0857_text_document +0.00015148430437371348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0858_text_document +0.00015699196205345046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0859_text_document +0.00016323993834433252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0860_text_document +0.00015419189482936103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0861_text_document +0.00014984592429281824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0862_text_document +0.0001540327550705441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0863_text_document +0.00015559458082419316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0864_text_document +0.00015809601003355687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0865_text_document +0.00015561437781246056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0866_text_document +0.00015650965510707114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0867_text_document +0.00015654223175785975 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0868_text_document +0.00015966194232830576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0869_text_document +0.0001542791440813034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0870_text_document +0.00016358133853488976 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0871_text_document +0.0001610108148402946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0872_text_document +0.0001567861463301872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0873_text_document +0.00015916579076809533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0874_text_document +0.00015834187212170972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0875_text_document +0.00015492852942470005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0876_text_document +0.0001565761307746086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0877_text_document +0.00016111787860345758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0878_text_document +0.00015262185821473176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0879_text_document +0.00015609313599061615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0880_text_document +0.00015265109415151545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0881_text_document +0.00015596676711588585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0882_text_document +0.00015602244000618423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0883_text_document +0.00015533087814847594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0884_text_document +0.000148761688602713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0885_text_document +0.00015124065708812265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0886_text_document +0.00015177148904071277 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0887_text_document +0.00015551510213818192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0888_text_document +0.00015328016792414618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0889_text_document +0.00014826652573194586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0890_text_document +0.00015618973632950672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0891_text_document +0.00016465597460827412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0892_text_document +0.00017729797829003265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0893_text_document +0.00017645710877786075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0894_text_document +0.000173993320599559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0895_text_document +0.0001752697954262395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0896_text_document +0.00017545831920313468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0897_text_document +0.00017512052874093406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0898_text_document +0.00017596295211949001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0899_text_document +0.0001763343681416489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0900_text_document +0.00016737628055788186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0901_text_document +0.00017659674006013248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0902_text_document +0.00017521085067973818 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0903_text_document +0.00018110203496350606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0904_text_document +0.00016887408015540739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0905_text_document +0.0001730418383091983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0906_text_document +0.00017084812178309202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0907_text_document +0.00016928946570955264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0908_text_document +0.00017272373105947043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0909_text_document +0.00016793546933797045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0910_text_document +0.00016510473373737477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0911_text_document +0.0001656625036518595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0912_text_document +0.00016849674877913583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0913_text_document +0.00017492155042464418 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0914_text_document +0.00017092357710033054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0915_text_document +0.00016970730743877006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0916_text_document +0.00016573665091766286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0917_text_document +0.00016358480536479716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0918_text_document +0.0001653802811890403 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0919_text_document +0.00017231807148475074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0920_text_document +0.00017361608596973323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0921_text_document +0.00017404933358323055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0922_text_document +0.00016371945617952907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0923_text_document +0.00017000836658266155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0924_text_document +0.00017142976487027857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0925_text_document +0.00017006281434704977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0926_text_document +0.0001751965302313473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0927_text_document +0.00016954848753554936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0928_text_document +0.0001683555446267139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0929_text_document +0.00016921278107076727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0930_text_document +0.00016808682594394623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0931_text_document +0.00017711704047105475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0932_text_document +0.0001675247295876393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0933_text_document +0.00017061773073498863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0934_text_document +0.0001644856648306077 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0935_text_document +0.00016530682645009105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0936_text_document +0.00016993430076157017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0937_text_document +0.00016716870217360928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0938_text_document +0.0001672477045314564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0939_text_document +0.00016150529456268964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0940_text_document +0.0001642955368396883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0941_text_document +0.0001650135010986092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0942_text_document +0.0001719916971031507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0943_text_document +0.0001663860254017646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0944_text_document +0.00016810785027934324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0945_text_document +0.00016663511368772123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0946_text_document +0.00017120237493641126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0947_text_document +0.0001651698100366788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0948_text_document +0.00016069571413445028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0949_text_document +0.0001631772602215936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0950_text_document +0.00016994484266892867 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0951_text_document +0.00016821930169126347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0952_text_document +0.0001680542144940534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0953_text_document +0.00015807234911071054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0954_text_document +0.00016287290799651364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0955_text_document +0.00016674360421415713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0956_text_document +0.0001663549971877126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0957_text_document +0.0001699417467826641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0958_text_document +0.0001661066433849769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0959_text_document +0.00016736976350010906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0960_text_document +0.00016160049405253383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0961_text_document +0.0001625500850979611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0962_text_document +0.00016172349111618741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0963_text_document +0.00016041582790085466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0964_text_document +0.00016369413378455798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0965_text_document +0.00016245798272839223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0966_text_document +0.00016458727969573578 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0967_text_document +0.0001618972714257936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0968_text_document +0.00016149423535800886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0969_text_document +0.00015886933917368354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0970_text_document +0.00015721961433801126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0971_text_document +0.00015609496997744904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0972_text_document +0.0001608435755282705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0973_text_document +0.00015730100598754584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0974_text_document +0.00015955845719642757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0975_text_document +0.00015469663090901824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0976_text_document +0.00015812452037199733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0977_text_document +0.00015443940925795885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0978_text_document +0.00015678701926941855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0979_text_document +0.00015787925332384637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0980_text_document +0.00015669644312439214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0981_text_document +0.00015342587917756964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0982_text_document +0.00015642024238741553 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0983_text_document +0.0001540823378708023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0984_text_document +0.00015238224416999995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0985_text_document +0.0001522695061784323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0986_text_document +0.00020085620305657233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0987_text_document +0.00014698197479826313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0988_text_document +0.00014796924883111914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0989_text_document +0.0001483800966807953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0990_text_document +0.00014550940307048242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0991_text_document +0.00015052597307667803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0992_text_document +0.00014866583878918362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0993_text_document +0.00014440801314961302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0994_text_document +0.00014295564464645108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0995_text_document +0.00014903049761507035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0996_text_document +0.00014820091066353183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0997_text_document +0.0001429454882440627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0998_text_document +0.00015048550764172483 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0999_text_document +0.0001430543312039796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1000_text_document +0.00014661342883839465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1001_text_document +0.00014721354013103223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1002_text_document +0.00014780017824708586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1003_text_document +0.0001463184859455721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1004_text_document +0.00014654870719379106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1005_text_document +0.00020943212095457075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1006_text_document +0.00021205821955900777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1007_text_document +0.00014176730212983274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1008_text_document +0.00014026276433980122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1009_text_document +0.00013570196535880505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1010_text_document +0.00014776685378575983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1011_text_document +0.00014138218982193943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1012_text_document +0.0001412602382122253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1013_text_document +0.00013944232659104602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1014_text_document +0.00014570617769030735 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1015_text_document +0.00014233071172042007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1016_text_document +0.00014016762901851798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1017_text_document +0.0001434413757259645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1018_text_document +0.00014003324697133565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1019_text_document +0.00014567282904236987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1020_text_document +0.00013992559507863123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1021_text_document +0.00021096883039305026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1022_text_document +0.00014274603730164107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1023_text_document +0.00013914595792215918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1024_text_document +0.00013666688380542608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1025_text_document +0.00014001152690065646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1026_text_document +0.00021392615254787925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1027_text_document +0.00014251166508793392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1028_text_document +0.00013886942449587415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1029_text_document +0.0002078004025575127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1030_text_document +0.00020928673622040174 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1031_text_document +0.00020558733131260538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1032_text_document +0.0002036663760886078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1033_text_document +0.00014592860566679667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1034_text_document +0.00014346325128200297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1035_text_document +0.00014068142446497316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1036_text_document +0.000142996292961803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1037_text_document +0.00020633185839414136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1038_text_document +0.00013684538988274547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1039_text_document +0.0002033768324865864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1040_text_document +0.000200593087523188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1041_text_document +0.0002297294147093001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1042_text_document +0.00022971372080690233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1043_text_document +0.00023092966691083417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1044_text_document +0.00015159247973379415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1045_text_document +0.00015257723761865372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1046_text_document +0.00015750287090187065 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1047_text_document +0.00015557071949799488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1048_text_document +0.00015138603787345713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1049_text_document +0.00014966823068820163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1050_text_document +0.00015481393029806212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1051_text_document +0.0001521335747073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1052_text_document +0.00015447866363472483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1053_text_document +0.0001564823000495303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1054_text_document +0.00015484698673224505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1055_text_document +0.00022305811126444646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1056_text_document +0.00015308102523761935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1057_text_document +0.00022494528198789627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1058_text_document +0.0002206911435725598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1059_text_document +0.00021440132246946592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1060_text_document +0.00014934935094772055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1061_text_document +0.00015275047150828305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1062_text_document +0.00021692931968428998 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1063_text_document +0.00023057843831795596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1064_text_document +0.00022061661869945533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1065_text_document +0.0001475889972917192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1066_text_document +0.00014965255899799802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1067_text_document +0.000146325773766483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1068_text_document +0.00021849119850040293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1069_text_document +0.00021649545481859658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1070_text_document +0.00014463616989778393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1071_text_document +0.00014301572221485565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1072_text_document +0.00014804643324427358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1073_text_document +0.0002143783669071859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1074_text_document +0.0001479303814401362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1075_text_document +0.00015068744684349907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1076_text_document +0.00021658806091136903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1077_text_document +0.00021333945668012075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1078_text_document +0.000142221472149436 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1079_text_document +0.0002158096794842747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1080_text_document +0.00021541031163695796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1081_text_document +0.0002160301031804424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1082_text_document +0.00014484879119054217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1083_text_document +0.00014717950537309672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1084_text_document +0.00021016132927298846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1085_text_document +0.00021433713539833563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1086_text_document +0.0001438233936284062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1087_text_document +0.0001447086593934949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1088_text_document +0.00021440017582664183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1089_text_document +0.00020841624205804798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1090_text_document +0.000213227136771408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1091_text_document +0.00020931414236598925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1092_text_document +0.0002134545412666026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1093_text_document +0.0002126803251195216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1094_text_document +0.00014114550507201583 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1095_text_document +0.00016444080384922814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1096_text_document +0.0001542515002652382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1097_text_document +0.0001608177523717217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1098_text_document +0.0001577693965006662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1099_text_document +0.0001615213258436368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1100_text_document +0.00014975169893108998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1101_text_document +0.00015902857074290308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1102_text_document +0.00015523901418979132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1103_text_document +0.00015842052994374488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1104_text_document +0.0001543439686424067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1105_text_document +0.0001559141331005536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1106_text_document +0.0001558557495821586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1107_text_document +0.00016108187362389814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1108_text_document +0.0001605357063724452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1109_text_document +0.0001588416921491903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1110_text_document +0.00015452564563384654 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1111_text_document +0.0001575925464658241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1112_text_document +0.000155416389913229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1113_text_document +0.00015834897089216795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1114_text_document +0.00015376802717866433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1115_text_document +0.00015257616131444455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1116_text_document +0.00015333466381495513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1117_text_document +0.00015356006723825613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1118_text_document +0.00015392513748333956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1119_text_document +0.00015808193589371923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1120_text_document +0.00015572715307115401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1121_text_document +0.00015677288071421776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1123_text_document +0.00015564703516755468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1124_text_document +0.00015473730933423342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1125_text_document +0.00015227152970932222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1126_text_document +0.00015062363935408713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1127_text_document +0.0001608838990519831 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1128_text_document +0.00016058746991656767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1129_text_document +0.00015232158785053588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1130_text_document +0.00015216796930278597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1131_text_document +0.00015531087359959403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1132_text_document +0.00017455174602057423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1133_text_document +0.00015220395996782025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1134_text_document +0.00022536045257736233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1135_text_document +0.00023391977994072452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1136_text_document +0.00022316737354122904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1137_text_document +0.00023097409031198833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1138_text_document +0.0001536444602488289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1139_text_document +0.00015290857223001657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1140_text_document +0.00015053717764782956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1141_text_document +0.0001487906308449292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1142_text_document +0.00022796481136694752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1143_text_document +0.00022388054021300896 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1144_text_document +0.00015633876287631285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1146_text_document +0.00015683128496399404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1147_text_document +0.0001498588984354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1148_text_document +0.00015466674094651695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1149_text_document +0.00015104328866230663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1150_text_document +0.0001510288850415886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1151_text_document +0.00015453329995596143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1152_text_document +0.0001717890160140908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1153_text_document +0.00016303689223488152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1154_text_document +0.00017438742884609578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1155_text_document +0.00017195307231868866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1156_text_document +0.00016630614911747752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1157_text_document +0.0001738954845222655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1158_text_document +0.00016759158755171884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1159_text_document +0.00017061259922452842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1160_text_document +0.00017196072417278202 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1161_text_document +0.00016824585118656202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1162_text_document +0.00016301309236242047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1163_text_document +0.0001718575393991296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1164_text_document +0.00017003663826341565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1165_text_document +0.00017018328983305946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1166_text_document +0.00017218141861091656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1167_text_document +0.00016559619112054818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1168_text_document +0.00016284882257395627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1169_text_document +0.0001617104078870124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1170_text_document +0.00016849349395228177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1171_text_document +0.00016378319727916067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1172_text_document +0.00017114019486042634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1173_text_document +0.0001726823065513329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1174_text_document +0.00016244897469644304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1175_text_document +0.0001613681046473606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1176_text_document +0.00018118661924575096 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1177_text_document +0.00016563345750593493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1178_text_document +0.00016790014898759615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1179_text_document +0.0001629142142864177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1180_text_document +0.00016191717527939525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1181_text_document +0.0001671004065869619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1182_text_document +0.0001675370141650324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1183_text_document +0.00016799445480682778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1184_text_document +0.0001719736620354862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1185_text_document +0.00016261057260474936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1186_text_document +0.00015865991174764644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1187_text_document +0.00015739800441831657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1188_text_document +0.00016171134746282626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1189_text_document +0.00016720238820009615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1190_text_document +0.00016497201020069133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1191_text_document +0.00016081080933342493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1192_text_document +0.0001598451415954535 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1193_text_document +0.00016189725587725768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1194_text_document +0.00015376149407875128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1195_text_document +0.00015923032632387212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1196_text_document +0.000161420662154024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1197_text_document +0.00015926844960634996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1198_text_document +0.000156372807999939 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1199_text_document +0.00016050285429044874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1200_text_document +0.00015617925982472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1201_text_document +0.00016514079794945202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1202_text_document +0.00016522274070820443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1203_text_document +0.0001597381170738336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1204_text_document +0.0001616744058690789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1205_text_document +0.00016029435854255644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1206_text_document +0.0001600416279503584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1207_text_document +0.0001607379715998696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1208_text_document +0.0001593514911283079 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1209_text_document +0.00015864317782095664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1210_text_document +0.00015911735436385907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1211_text_document +0.0001556275795066712 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1212_text_document +0.0001656764173702947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1213_text_document +0.00015679155524627255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1214_text_document +0.00016376988600479205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1215_text_document +0.0001581538165285075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1216_text_document +0.0001610240227045592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1217_text_document +0.00015776131940645536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1218_text_document +0.00015818231748846595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1219_text_document +0.0001625550897521123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1220_text_document +0.0001547371099180901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1221_text_document +0.00015414283944531357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1222_text_document +0.00016266088273096592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1223_text_document +0.00016083169545961368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1224_text_document +0.0001573027086756309 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1225_text_document +0.00015728313997935927 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1226_text_document +0.00016781226249248295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1227_text_document +0.00014976228995207784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1228_text_document +0.00015444629923379175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1229_text_document +0.00015203154472094758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1230_text_document +0.00015416974359531256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1231_text_document +0.00015545110214308707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1232_text_document +0.0001510309557116906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1233_text_document +0.000150151986610048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1234_text_document +0.00014833490597173326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1235_text_document +0.00014730918386476007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1236_text_document +0.00014903663558472915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1237_text_document +0.00014834903218682616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1238_text_document +0.00015322537809196756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1239_text_document +0.0001511230642513134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1240_text_document +0.00015357591909403477 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1241_text_document +0.00015295542934724653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1242_text_document +0.00015013958035919124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1243_text_document +0.00015023610122778707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1244_text_document +0.00014784318253583398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1245_text_document +0.00015065966876706016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1246_text_document +0.0001481405433493943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1247_text_document +0.00014721741369089534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1248_text_document +0.00014730057861393202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1249_text_document +0.00015235999841072513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1250_text_document +0.00014541040677624616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1251_text_document +0.00014639042630648248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1252_text_document +0.00015068532335773535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1253_text_document +0.00015516053357170532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1254_text_document +0.00014515004876336832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1255_text_document +0.0001488593805475465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1256_text_document +0.0001506759742452044 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1257_text_document +0.0001429840653957083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1258_text_document +0.00014437998012654534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1259_text_document +0.0001428860592717282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1260_text_document +0.0001475220383855572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1261_text_document +0.00014640582972274082 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1262_text_document +0.0001505350968588391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1263_text_document +0.00014784485165882563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1264_text_document +0.00014770697193146622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1265_text_document +0.0001433464625266231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1266_text_document +0.00014139730694769496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1267_text_document +0.00014139435371307747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1268_text_document +0.00014164383589527758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1269_text_document +0.0001429075740030123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1270_text_document +0.00014605872692153072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1271_text_document +0.0001424796215298057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1272_text_document +0.00014112515203848743 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1273_text_document +0.00014039188160335826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1274_text_document +0.00014502736267043328 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1275_text_document +0.00014184146815260007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1276_text_document +0.0001453216584479987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1277_text_document +0.00014226985746562565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1278_text_document +0.00013903471234323833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1279_text_document +0.00014633669945119654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1280_text_document +0.00015567823959834718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1281_text_document +0.00016711998145328748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1282_text_document +0.00016716820782888765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1283_text_document +0.00016788189624042867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1284_text_document +0.00016762149528397544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1285_text_document +0.00016394982452183396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1286_text_document +0.00017499487929449305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1287_text_document +0.00017285598246362648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1288_text_document +0.0001813127546456402 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1289_text_document +0.00016923644001919636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1290_text_document +0.00016671545149204298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1291_text_document +0.0001691584149978932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1292_text_document +0.00016279240063910965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1293_text_document +0.00016581675179191334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1294_text_document +0.00016709742151486606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1295_text_document +0.00016462921631835026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1296_text_document +0.0001635773235573904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1297_text_document +0.0001629499633321397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1298_text_document +0.00016244603775076793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1299_text_document +0.00016565874682941692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1300_text_document +0.00016704769334813707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1301_text_document +0.00016527793060668047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1302_text_document +0.0001614670182628741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1303_text_document +0.00016090321773766912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1304_text_document +0.00016205158644923216 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1305_text_document +0.00016115649647745916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1306_text_document +0.00016750884342636079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1307_text_document +0.0001593023982303325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1308_text_document +0.00015894512446540672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1309_text_document +0.00016391499925658774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1310_text_document +0.0001615310219600013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1311_text_document +0.00016109142610140696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1312_text_document +0.0001622135071747606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1313_text_document +0.00016686311075489617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1314_text_document +0.00016322992039795453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1315_text_document +0.00015923727775344227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1316_text_document +0.00016528070219491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1317_text_document +0.00016089805290891765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1318_text_document +0.00016142731643379644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1319_text_document +0.00016164621217780662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1320_text_document +0.00015738061325748116 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1321_text_document +0.0001591233926254462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1322_text_document +0.00016649327648776514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1323_text_document +0.00016299925243783037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1324_text_document +0.00016490993699004063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1325_text_document +0.0001589061309585213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1326_text_document +0.00015701373074415468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1327_text_document +0.00015755460137450403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1328_text_document +0.00016368403834230255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1329_text_document +0.0001619141257919363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1330_text_document +0.0002274793692927606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1331_text_document +0.0001567633247814788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1332_text_document +0.00022905033511751312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1333_text_document +0.0001548301064518758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1334_text_document +0.000226605319945327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1335_text_document +0.00022667037674726058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1336_text_document +0.00022923961805784498 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1337_text_document +0.00014906828549341607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1338_text_document +0.00015829222539969273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1339_text_document +0.0001509036911919305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1340_text_document +0.00022536653378252486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1341_text_document +0.00015104016760222197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1342_text_document +0.00015099364342110257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1343_text_document +0.00022777331115603203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1344_text_document +0.00021580582739619934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1345_text_document +0.0001492017484493636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1346_text_document +0.0002232038326367584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1347_text_document +0.0002173110715340058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1348_text_document +0.0002106853410947563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1349_text_document +0.00021523392953900664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1350_text_document +0.00021996424976477582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1351_text_document +0.00021735745725911482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1352_text_document +0.00014743618479981591 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1353_text_document +0.00021587099328468655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1354_text_document +0.00021669175360386172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1355_text_document +0.00021667379282364665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1356_text_document +0.0002192120523189847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1357_text_document +0.00021547193097844086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1358_text_document +0.00021621049112421326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1359_text_document +0.00021196265801039842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1360_text_document +0.00021115416894129982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1361_text_document +0.00021548122875612305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1362_text_document +0.0002167839127379268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1363_text_document +0.00021388435981092266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1364_text_document +0.00021247309275187394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1365_text_document +0.00020865156988970925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1366_text_document +0.00021232420243985875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1367_text_document +0.00020288941772275403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1368_text_document +0.00020534370920083462 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1369_text_document +0.00014906807620518648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1370_text_document +0.0002110153701227056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1371_text_document +0.00020709542453451886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1372_text_document +0.00020465988557797482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1373_text_document +0.000195974694790701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1374_text_document +0.0002006410964660873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1375_text_document +0.00020083864604468702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1376_text_document +0.00020640909562295756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1377_text_document +0.0002009390668809768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1378_text_document +0.00019660322090934407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1379_text_document +0.0002031382964736789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1380_text_document +0.00019629671755665872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1381_text_document +0.00019754174238439996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1382_text_document +0.0002056909946356413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1383_text_document +0.0001979138566098626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1384_text_document +0.0001932131948461709 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1385_text_document +0.00020416546879013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1386_text_document +0.00020460391232945065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1387_text_document +0.00019389888059130955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1388_text_document +0.00019783854863351214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1389_text_document +0.000200961415063147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1390_text_document +0.0001956818423121531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1391_text_document +0.00020637040765714317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1392_text_document +0.00020119793791085526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1393_text_document +0.0002019159752232148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1394_text_document +0.00020709690510066213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1395_text_document +0.00019733093804912572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1396_text_document +0.0001880608678579731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1397_text_document +0.0002016375431479316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1398_text_document +0.00019179791527764437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1399_text_document +0.00018506553224762644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1400_text_document +0.00019958850500821938 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1401_text_document +0.0001963985599733761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1402_text_document +0.00019686962952391687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1403_text_document +0.00019466431453041557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1404_text_document +0.00019423474723069192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1405_text_document +0.00018645004940802463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1406_text_document +0.0001957563417646353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1407_text_document +0.00019567310057973193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1408_text_document +0.00019820964060443815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1409_text_document +0.0001922448994056278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1410_text_document +0.00018809380854194413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1411_text_document +0.00019183325882742152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1412_text_document +0.00018979529371331087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1413_text_document +0.00018194205843788177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1414_text_document +0.000185326810832552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1415_text_document +0.00018768967790659056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1416_text_document +8.238038512980449e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1417_text_document +7.037628876350043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1418_text_document +7.031761895460266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1419_text_document +6.852561440270574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1420_text_document +7.163053214543125e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1421_text_document +6.965337217248569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1422_text_document +7.217926984135532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1423_text_document +6.887448282655111e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1424_text_document +7.065036798913058e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1425_text_document +7.00112034634854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1426_text_document +7.22545398101735e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1427_text_document +7.06581990215903e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1428_text_document +8.704526082745054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1429_text_document +7.647895905010174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1430_text_document +6.667599117230014e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1431_text_document +9.175439580281598e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1432_text_document +6.936196694178977e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1433_text_document +6.994579003243415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1434_text_document +6.85501978720171e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1435_text_document +6.733846418731063e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1436_text_document +6.760126406073544e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1437_text_document +6.979877393600358e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1438_text_document +6.866399513844505e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1439_text_document +6.599386727589954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1440_text_document +7.022110351565428e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1441_text_document +6.889110495186351e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1442_text_document +7.249533430962498e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1443_text_document +7.061312850517899e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1444_text_document +7.026495137417699e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1445_text_document +7.053710208774785e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1446_text_document +7.079302654666706e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1447_text_document +7.142821385554296e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1448_text_document +6.884074447800683e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1449_text_document +6.775299728680366e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1450_text_document +6.935640081273007e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1451_text_document +7.071164131398859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1452_text_document +7.251697614402021e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1453_text_document +0.00012391766284956256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1454_text_document +6.876051279861284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1455_text_document +7.000563116437178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1456_text_document +7.021430732464126e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1457_text_document +7.320305084935923e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1458_text_document +6.93854906426365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1459_text_document +7.268065730933861e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1460_text_document +0.00015861223006440801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1461_text_document +0.00018316051674097559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1462_text_document +0.00018895109829526356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1463_text_document +0.0001814266629730391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1464_text_document +0.00019157095403912478 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1465_text_document +0.0001816102282477865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1466_text_document +0.00018024098024327291 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1467_text_document +0.00018881439877582162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1468_text_document +0.00018219752655961166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1469_text_document +0.00018211134990984607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1470_text_document +0.00018153994220173833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1471_text_document +0.00018033319169939537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1472_text_document +0.00017832883141386175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1473_text_document +0.00018126942359083546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1474_text_document +0.0001780495979719729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1475_text_document +0.00017938729404825616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1476_text_document +0.00017959022144586935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1477_text_document +0.00017639465376427234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1478_text_document +0.00017486395149030808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1479_text_document +0.0001808240528085561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1480_text_document +0.00017274905690967175 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1481_text_document +0.00017486036693577152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1482_text_document +0.00017698269026135074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1483_text_document +0.00016841434450034874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1484_text_document +0.00016863721573351308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1485_text_document +0.00017483420640067329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1486_text_document +0.0001732828833128397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1487_text_document +0.00017295290907496933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1488_text_document +0.00017133232046800912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1489_text_document +0.00016977827391836668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1490_text_document +0.0001740935435005184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1491_text_document +3.6341265268857285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1492_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_peS2o.txt b/ALCF/data-lists/sunspot/data_file_list_peS2o.txt new file mode 100644 index 0000000000..dff1249cf5 --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_peS2o.txt @@ -0,0 +1,26 @@ +0.001258392312111664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0000_text_document +0.0012624388132337304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0001_text_document +0.0012626279540316713 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0002_text_document +0.0012611745647392154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0003_text_document +0.001261744080471196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0004_text_document +0.0012540395730196387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0005_text_document +0.0012674794155994474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0006_text_document +0.0015181894311854882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0007_text_document +0.003353431842116585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0008_text_document +0.0033457523561418873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0009_text_document +0.0035360737173355393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0010_text_document +0.0035328021064248917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0011_text_document +0.003537853575841124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0012_text_document +0.0033495442948704096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0013_text_document +0.0033515559656802623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0014_text_document +0.0035292441286648877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0015_text_document +0.0033472466636064995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0016_text_document +0.003347244907254542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0017_text_document +0.003361109976122766 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0018_text_document +0.003527949940706846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0019_text_document +0.0033629959027952918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0020_text_document +0.003534363177394335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0021_text_document +0.0033534091101340303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0022_text_document +0.003362863367631581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0023_text_document +0.0035295619700253587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0024_text_document +0.00039091085286111746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0025_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_reddit.txt b/ALCF/data-lists/sunspot/data_file_list_reddit.txt new file mode 100644 index 0000000000..644d717021 --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_reddit.txt @@ -0,0 +1,78 @@ +0.0011541728836721287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0000_text_document +0.001194214065746794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0001_text_document +0.0012074645870644872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0002_text_document +0.0011669676257397446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0003_text_document +0.0011730429598479002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0004_text_document +0.0011829645036126979 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0005_text_document +0.0011882713613863669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0006_text_document +0.0011409601969657492 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0007_text_document +0.0011370779956530767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0008_text_document +0.0011566277261230336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0009_text_document +0.0011377530435595722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0010_text_document +0.0011422212106036002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0011_text_document +0.0011359648236479313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0012_text_document +0.0011422819300771266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0013_text_document +0.0011613422088431185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0014_text_document +0.001149222546698594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0015_text_document +0.0011520428345756523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0016_text_document +0.0011408015787470732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0017_text_document +0.001145413257179254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0018_text_document 
+0.0011543340882314167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0019_text_document +0.0011397083750923865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0020_text_document +0.001163788652940794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0021_text_document +0.0011441686420414542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0022_text_document +0.0011429505546541332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0023_text_document +0.00117471168582067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0024_text_document +0.0011456585273133617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0025_text_document +0.0011738665177335344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0026_text_document +0.0011646176186295262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0027_text_document +0.0011629386473461694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0028_text_document +0.0011421097688385183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0029_text_document +0.0011459477142114253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0030_text_document +0.0011756431096178663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0031_text_document +0.0011482680809577622 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0032_text_document +0.0011445710176100962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0033_text_document +0.001142534803152167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0034_text_document +0.0011422043218494292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0035_text_document +0.0011678344410475695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0036_text_document +0.0011562147470581413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0037_text_document +0.0011468122833549663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0038_text_document +0.0011532706690152916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0039_text_document +0.0011292882378850658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0040_text_document +0.0011300177059999066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0041_text_document +0.0011287171558685828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0042_text_document +0.0011295841562723513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0043_text_document +0.0011279954847952854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0044_text_document +0.0011283817109930107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0045_text_document 
+0.001128286479630481 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0046_text_document +0.0011276081740353844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0047_text_document +0.0011268985652144736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0048_text_document +0.0011261863340342809 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0049_text_document +0.0011248860240274238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0050_text_document +0.0011253794147731645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0051_text_document +0.0011242857628861397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0052_text_document +0.0011228472942657042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0053_text_document +0.00112269047698053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0054_text_document +0.0011234938283922757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0055_text_document +0.0011230927745087202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0056_text_document +0.0011247141749506225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0057_text_document +0.0011241207913742775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0058_text_document +0.0011220187728072355 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0059_text_document +0.0011227320045060405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0060_text_document +0.0011217839100677303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0061_text_document +0.0011210875921360617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0062_text_document +0.0011221651716921029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0063_text_document +0.0011248396609954611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0064_text_document +0.0012275703827670792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0065_text_document +0.0011056036331311184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0066_text_document +0.001107902944963784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0067_text_document +0.0010968114497626087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0068_text_document +0.0011027306309299484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0069_text_document +0.0010853624892717291 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0070_text_document +0.0011051858405711837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0071_text_document +0.0010808015771539223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0072_text_document 
+0.0010855928806935572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0073_text_document +0.0010442141182932184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0074_text_document +0.0011804749731815143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0075_text_document +0.0011670805522744465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0076_text_document +0.0008366052616529944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0077_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_stack.txt b/ALCF/data-lists/sunspot/data_file_list_stack.txt new file mode 100644 index 0000000000..cbaf3cedde --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_stack.txt @@ -0,0 +1,149 @@ +0.0010659025986423038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0000_text_document +0.001089820700651703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0001_text_document +0.0010894690468995446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0002_text_document +0.0010893103153582777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0003_text_document +0.001092968830569157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0004_text_document +0.0010927822953669655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0005_text_document +0.0010948538530423937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0006_text_document +0.0010914947459084862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0007_text_document 
+0.0011531345061061805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0008_text_document +0.0009273732822541429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0009_text_document +0.0009298094568342398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0010_text_document +0.0009269985376241653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0011_text_document +0.0009299414467502114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0012_text_document +0.0009281292496915194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0013_text_document +0.0009300797305068478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0014_text_document +0.0009575658299825903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0015_text_document +0.001124706364232967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0016_text_document +0.0011201757618238954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0017_text_document +0.001126433347327465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0018_text_document +0.0011299837668245817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0019_text_document +0.001127851225271931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0020_text_document +0.0011265589698280143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0021_text_document +0.0011227970380980016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0022_text_document +0.001131300918127052 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0023_text_document +0.00112588381546472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0024_text_document +0.0011692456277892793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0025_text_document +0.0011330744556493294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0026_text_document +0.001041946972706877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0027_text_document +0.0010493121881969634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0028_text_document +0.0009912570469629923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0029_text_document +0.0012717963903526445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0030_text_document +0.0014051955824199262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0031_text_document +0.0011248653480876683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0032_text_document +0.0015096975127629315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0033_text_document +0.001056885183600456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0034_text_document +0.0010523010671513575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0035_text_document +0.001055691055690255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0036_text_document +0.0012434898779499373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0037_text_document +0.0009615620261395163 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0038_text_document +0.0011689290747945063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0039_text_document +0.0012610288149681123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0040_text_document +0.0012183045747008489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0041_text_document +0.0012232394891956877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0042_text_document +0.0012316862572191265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0043_text_document +0.001171858466558184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0044_text_document +0.0009288715082322405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0045_text_document +0.0009096255640660796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0046_text_document +0.0009098493089021282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0047_text_document +0.000908428701094243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0048_text_document +0.0009115948236386599 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0049_text_document +0.0009109761446993803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0050_text_document +0.0009097199236925156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0051_text_document +0.0009103946801923116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0052_text_document +0.0009109038594994949 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0053_text_document +0.0009098133932243314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0054_text_document +0.0009111744494635876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0055_text_document +0.0008961257268851344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0056_text_document +0.0008499219991848833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0057_text_document +0.000848817192629684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0058_text_document +0.0008469931268429987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0059_text_document +0.0008487804660301039 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0060_text_document +0.0008535293627452302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0061_text_document +0.0008508082359285502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0062_text_document +0.000847764423021283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0063_text_document +0.0008661814491784624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0064_text_document +0.0012598427266996145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0065_text_document +0.0015411645064455006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0066_text_document +0.0015500690406153115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0067_text_document +0.0010431702414192465 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0068_text_document +0.0010103298065465376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0069_text_document +0.0009173697763272889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0070_text_document +0.0009149081716719212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0071_text_document +0.0009223001515794829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0072_text_document +0.0009231205497115238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0073_text_document +0.0009205400022638854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0074_text_document +0.000921891356231865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0075_text_document +0.0009206550523916788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0076_text_document +0.000919101114727538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0077_text_document +0.0009189314293443922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0078_text_document +0.0009187845413397615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0079_text_document +0.0009212488966514148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0080_text_document +0.0009193937503280587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0081_text_document +0.0013803871878583557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0082_text_document +0.0009950213666737198 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0083_text_document +0.000927893134699511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0084_text_document +0.0009256115426841411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0085_text_document +0.0009245248815034989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0086_text_document +0.0009239324963431647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0087_text_document +0.00093017264782812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0088_text_document +0.0009246774971430524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0089_text_document +0.0009246651817682976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0090_text_document +0.0009220962135479767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0091_text_document +0.0009218191222144196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0092_text_document +0.0009271314108370893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0093_text_document +0.0011393174361636815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0094_text_document +0.0010056046636817732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0095_text_document +0.000985188940051775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0096_text_document +0.0009834908338499898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0097_text_document +0.0009841221104671695 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0098_text_document +0.0009846688252964021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0099_text_document +0.0009846837273836892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0100_text_document +0.000983200779763785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0101_text_document +0.000983626091844726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0102_text_document +0.0009227550215195058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0103_text_document +0.0008517634745985513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0104_text_document +0.0009820984183696825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0105_text_document +0.001062956613371643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0106_text_document +0.0009446580160861343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0107_text_document +0.000849273787178016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0108_text_document +0.0010838798124933814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0109_text_document +0.0016259767652594482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0110_text_document +0.0009261166233974987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0111_text_document +0.0013044836937627727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0112_text_document +0.0017111272224419217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0113_text_document +0.0017274616815008634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0114_text_document +0.0017204942871235126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0115_text_document +0.0017119592701771347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0116_text_document +0.0016979912192342588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0117_text_document +0.001701886248500233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0118_text_document +0.0017227272126357288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0119_text_document +0.0017014517255794117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0120_text_document +0.0016995002579026628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0121_text_document +0.0016958447424626011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0122_text_document +0.0017111887981161064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0123_text_document +0.0017172926007805738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0124_text_document +0.0016938659465618113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0125_text_document +0.0016877576226485259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0126_text_document +0.0017144361080061983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0127_text_document +0.0017173753931755767 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0128_text_document +0.001713308056226134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0129_text_document +0.0017126769067653286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0130_text_document +0.0017129095633438736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0131_text_document +0.001704961253905759 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0132_text_document +0.0009282082505873367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0133_text_document +0.0007973220067601047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0134_text_document +0.0008407445714413182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0135_text_document +0.0008403726198530843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0136_text_document +0.0008371632157580058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0137_text_document +0.0013060325919558903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0138_text_document +0.0014100060700040244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0139_text_document +0.0008750222172256031 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0140_text_document +0.0016918433420911735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0141_text_document +0.001838605753011377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0142_text_document +0.0016004536814984726 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0143_text_document +0.0011738110086663097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0144_text_document +0.0011269892510041232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0145_text_document +0.0011251329530758676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0146_text_document +0.0011788404279377853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0147_text_document +0.0007876495048700586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0148_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_wiki.txt b/ALCF/data-lists/sunspot/data_file_list_wiki.txt new file mode 100644 index 0000000000..65169566eb --- /dev/null +++ b/ALCF/data-lists/sunspot/data_file_list_wiki.txt @@ -0,0 +1,2 @@ +0.0035577638528123345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/en_simple_wiki_v0-0000_text_document +0.0018422361471876658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/en_simple_wiki_v0-0001_text_document From 80dc91c416e08cd72ec3db74b49976e2767adead Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 19 Apr 2024 13:17:25 -0500 Subject: [PATCH 189/268] Add support for `--optimizer ipex.{fused}lamb` --- megatron/optimizer/__init__.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 80adeb8521..48f2737a06 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -3,7 +3,7 @@ from deepspeed.accelerator import get_accelerator import torch -from typing import Callable, Any +from typing import Callable, Any, Iterable, Union from megatron import get_args from .distrib_optimizer import DistributedOptimizer @@ -16,7 
+16,7 @@ def get_param_groups( - modules: torch.nn.Module | iter[torch.nn.Module], + modules: Union[torch.nn.Module, Iterable[torch.nn.Module]], no_weight_decay_cond: Callable[[str, torch.Tensor], bool], scale_lr_cond: Callable[[str, torch.Tensor], bool], lr_mult: Any, @@ -110,6 +110,7 @@ def get_megatron_optimizer( lr_mult=1.0 ): args = get_args() + assert args is not None # Base optimizer. param_groups = get_param_groups( @@ -218,6 +219,25 @@ def optimizer_hook(p): if p.requires_grad: p.register_post_accumulate_grad_hook(optimizer_hook) layer_wise_flag = True + elif str(args.optimizer) == 'ipex.lamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + ) + elif str(args.optimizer) == 'ipex.fusedlamb': + from intel_extension_for_pytorch.optim._lamb import Lamb + optimizer = Lamb( + param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps, + fused=True, + ) elif str(args.optimizer).lower() == 'ds.fusedlamb': from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb( From 36af9fc9cf90c0f41cbb60d920302f10ca9f525e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 19 Apr 2024 13:17:36 -0500 Subject: [PATCH 190/268] Add support for `--optimizer ipex.{fused}lamb` --- megatron/arguments.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 70dfa88ef6..702a4bad09 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -924,6 +924,8 @@ def _add_training_args(parser): 'adamw', 'sgd', 'ds.fusedlamb', + 'ipex.lamb', + 'ipex.fusedlamb', 'apex.adam', 'apex.sgd', 'adamwschedulefree', From 997c39fb3a80fe684e878fbdd40ab1429f324886 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 19 Apr 2024 16:49:13 -0500 Subject: [PATCH 191/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 25 
++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 151c3e857a..d124ecb242 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -144,12 +144,13 @@ setArgs() { } # +---------------------------------------+ -# | 1. Git clone `ezpz` (if not found) | -# | 2. Install `ezpz` (if not installed) | +# | 1. Git clone ezpz (if not found) | +# | 2. Install ezpz (if not installed) | # +---------------------------------------+ ezpz() { - if [[ ! -d ezpz ]]; then - git clone https://github.com/saforem2/ezpz + if [[ ! -d "${PBS_O_WORKDIR}/deps/ezpz" ]]; then + mkdir -p "${PBS_O_WORKDIR}/deps" + git clone https://github.com/saforem2/ezpz "${PBS_O_WORKDIR}/deps" else echo "Found ezpz!" fi @@ -157,12 +158,12 @@ ezpz() { echo "Has ezpz installed. Nothing to do." else echo "Does not have ezpz installed. Installing..." - echo "Using $(which python3) to install \`ezpz\`:" - python3 -m pip install -e ezpz > ezpz-install.log 2>&1 + echo "Using $(which python3) to install ezpz:" + python3 -m pip install -e "${PBS_O_WORKDIR}/edps/ezpz" # > ezpz-install.log 2>&1 fi echo "Done with ezpz." - # source ezpz/src/ezpz/bin/savejobenv || exit # > /tmp/savejobenv.log 2>&1 || exit - # source ezpz/src/ezpz/bin/getjobenv || exit + source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit + source ezpz/src/ezpz/bin/getjobenv || exit } # +------------------------------------------------------------------------+ @@ -270,9 +271,11 @@ setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------------- if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" elif [[ $(hostname) == x1* ]]; then - dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" + dfl_fallback="../ALCF/data-lists/sunspot/data_file_list_books.txt" + # dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" elif [[ $(hostname) == x3* ]]; then - dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" + # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" + dfl_fallback="../ALCF/data-lists/polaris/data_file_list_books.txt" elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" else @@ -280,7 +283,7 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- fi dfl="${1:-${dfl_fallback}}" # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" - printf "Calling: \`setData()\` with %s\n" "${dfl}" + printf "Calling: setData() with %s\n" "${dfl}" ndocs=$(wc -l < "${dfl}") ws=$(sumWeights "${dfl}") dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") From e179fe05af7c19e5a9b590be57b0ffad8da38723 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 22 Apr 2024 21:56:57 -0500 Subject: [PATCH 192/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index d124ecb242..216798876b 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -1,5 +1,16 @@ #!/bin/bash --login +if [[ -n "${PBS_O_WORKDIR}" ]]; then + WORKING_DIR="${PBS_O_WORKDIR}" +elif [[ -n "${SLURM_SUBMIT_DIR}" ]]; then + WORKING_DIR="${SLURM_SUBMIT_DIR}" +else + WORKING_DIR="$(realpath $(pwd))" +fi + +export 
WORKING_DIR="${WORKING_DIR}" +printf "Using WORKING_DIR: %s" ${WORKING_DIR} + printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" @@ -50,7 +61,7 @@ setParams() { export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 - export WORKING_DIR="${PBS_O_WORKDIR}" + # export WORKING_DIR="${PBS_O_WORKDIR}" if [[ -z "${NO_FLASH_ATTN}" ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn" fi @@ -62,7 +73,7 @@ setParams() { # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 - export WORKING_DIR="${PBS_O_WORKDIR}" + # export WORKING_DIR="${PBS_O_WORKDIR}" if [[ -z "${NO_FLASH_ATTN}" ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi @@ -73,7 +84,7 @@ setParams() { export BE="${NCCL}" export DTYPE="${DTYPE:-bf16}" MICRO_BATCH="${MICRO_BATCH:-8}" - export WORKING_DIR="${SLURM_SUBMIT_DIR}" + # export WORKING_DIR="${SLURM_SUBMIT_DIR}" if [[ -z "${NO_FLASH_ATTN}" ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi @@ -162,8 +173,8 @@ ezpz() { python3 -m pip install -e "${PBS_O_WORKDIR}/edps/ezpz" # > ezpz-install.log 2>&1 fi echo "Done with ezpz." - source ezpz/src/ezpz/bin/savejobenv > /tmp/savejobenv.log 2>&1 || exit - source ezpz/src/ezpz/bin/getjobenv || exit + source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" > /tmp/savejobenv.log 2>&1 || exit + source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit } # +------------------------------------------------------------------------+ @@ -236,11 +247,15 @@ setEnv() { fi # ----- [Polaris] --------------------------------------- elif [[ $(hostname) == x3* ]]; then - echo "Running on Polaris !!" 
- # ---- [load conda] --------------------- - module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE - if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then - source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + if [[ "${PBS_O_HOST}" == sirius* ]]; then + export MACHINE="Running on Sirius !!" + else + echo "Running on Polaris !!" + # ---- [load conda] --------------------- + # module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE + if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + fi fi elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then echo "Running on Perlmutter !!" @@ -254,8 +269,8 @@ setEnv() { } makeHostfiles() { - source ezpz/src/ezpz/bin/savejobenv || exit #> /tmp/savejobenv.log 2>&1 & - source ezpz/src/ezpz/bin/getjobenv || exit + # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" || exit #> /tmp/savejobenv.log 2>&1 & + # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" # ---- Make MPICH hostfile ---------------- hf="${HOSTFILE:-${PBS_NODEFILE}}" @@ -271,11 +286,11 @@ setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------------- if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" elif [[ $(hostname) == x1* ]]; then - dfl_fallback="../ALCF/data-lists/sunspot/data_file_list_books.txt" + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/data_file_list_books.txt" # dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" elif [[ $(hostname) == x3* ]]; then # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" - dfl_fallback="../ALCF/data-lists/polaris/data_file_list_books.txt" + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/data_file_list_books.txt" elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" else From 133f24484f302bb8fc439b6bba772eb0dec9653a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 23 Apr 2024 16:07:10 -0500 Subject: [PATCH 193/268] Remove `apex` deps from `megatron/*` --- megatron/model/__init__.py | 12 +++++++++--- megatron/optimizer/clip_grads.py | 10 +++++++--- megatron/optimizer/distrib_optimizer.py | 7 ++++--- megatron/utils.py | 13 ++++++++----- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 2306749fcb..141c901ffa 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,12 +1,18 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-from deepspeed.accelerator.real_accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator.real_accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from apex.normalization import MixedFusedRMSNorm as RMSNorm -else: + HAS_APEX = True +except Exception: + HAS_APEX = False from .rmsnorm import RMSNorm from torch.nn import LayerNorm +# else: +# from .rmsnorm import RMSNorm +# from torch.nn import LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index afec8f220c..b5141d0059 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -8,10 +8,14 @@ except ModuleNotFoundError: from torch import inf as inf -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.multi_tensor_apply import multi_tensor_applier import amp_C + HAS_APEX = True +except Exception: + HAS_APEX = False from megatron.model.module import param_is_not_shared from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate @@ -71,7 +75,7 @@ def clip_grad_norm_fp32(parameters, grads_for_norm, else: if norm_type == 2.0: - if get_accelerator().device_name() == 'cuda': + if get_accelerator().device_name() == 'cuda' and HAS_APEX: dummy_overflow_buf = torch.cuda.IntTensor([0]) # Use apex's multi-tensor applier for efficiency reasons. 
# Multi-tensor applier takes a function and a list of list diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index 1aeeac3444..10331607d9 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -2,10 +2,11 @@ """Megatron distributed optimizer.""" -from deepspeed.accelerator import get_accelerator -if get_accelerator().device_name() == 'cuda': +# from deepspeed.accelerator import get_accelerator +# if get_accelerator().device_name() == 'cuda': +try: from apex.optimizers import FusedAdam as Adam -else: +except Exception: from torch.optim import Adam import math diff --git a/megatron/utils.py b/megatron/utils.py index 97294070af..f6a293281a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -10,8 +10,12 @@ from deepspeed.accelerator import get_accelerator if get_accelerator().device_name() == 'cuda': - from apex.multi_tensor_apply import multi_tensor_applier - import amp_C + try: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C + HAS_APEX = True + except Exception: + HAS_APEX = False from megatron import ( get_args, @@ -74,15 +78,14 @@ def calc_params_l2_norm(model): # Calculate norm dummy_overflow_buf = get_accelerator().IntTensor([0]) - if get_accelerator().device_name() == 'cuda': - + if get_accelerator().device_name() == 'cuda' and HAS_APEX: norm, _ = multi_tensor_applier( amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm ) - else : + else: norm = torch.norm(params_data,p=2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. 
From 42a27fb40cd5758b3aa8e03bcf94d0e896e45fce Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 23 Apr 2024 16:09:54 -0500 Subject: [PATCH 194/268] Move `generate_config.sh` logic into `ALCF/helpers.sh` --- ALCF/helpers.sh | 179 +++++++++++++++++++++++++++++++++++++++++++-- generate_config.sh | 172 ------------------------------------------- 2 files changed, 173 insertions(+), 178 deletions(-) delete mode 100644 generate_config.sh diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 216798876b..18fe965f82 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -95,7 +95,11 @@ setParams() { export DTYPE="${DTYPE:-bf16}" export OPT="${OPT:-adamw}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" - export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} + NHOSTS=$(wc -l < "${HOSTFILE}") + NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') + export WORLD_SIZE="${WORLD_SIZE:-$(( NHOSTS * NGPU_PER_HOST ))}" + # export WORLD_SIZE="${WORLD_SIZE:-${NGPUS:-$(( ))}}" + # export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} # ---- Llama2 7B Config ------------------------------ export MODEL_KEY="Llama-7B" export HEADS=${HEADS:-${NHEADS:-32}} @@ -207,11 +211,13 @@ setOutput() { buildDSconfig() { # ---- Build DeepSpeed Config --------------------------------- - export DS_CONFIG="ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" + export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" + mkdir -p $(dirname "${DS_CONFIG}") echo "DS_CONFIG: ${DS_CONFIG}" printf "ZS: %s, CPU_OPTIMIZER: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" - working_dir="${PBS_O_WORKDIR:-${SLURM_SUBMIT_DIR:-$(pwd)}}" - bash "${working_dir}/generate_config.sh" "${DS_CONFIG}" + # working_dir="${PBS_O_WORKDIR:-${SLURM_SUBMIT_DIR:-$(pwd)}}" + generateDSconfig "${DS_CONFIG}" + # bash 
"${WORKING_DIR}/ALCF/generate_ds_config.sh" "${DS_CONFIG}" # ------------------------------------------------------------- } @@ -302,8 +308,8 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- ndocs=$(wc -l < "${dfl}") ws=$(sumWeights "${dfl}") dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") - dcp="${HERE}/.cache/${dfl_stem}/index-cache" - mkdir -p dcp + dcp=".cache/${dfl_stem}/index-cache" + # mkdir -p dcp export DATA_FILE_LIST="${dfl}" export NUM_DOCS="${ndocs}" export WEIGHT_SUM="${ws}" @@ -319,6 +325,167 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- echo "--------------------" } +generateDSconfig() { + for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ + "$PP" "$DTYPE" + do + if [ -z $v ]; then + echo "Please export required envs before execute $0" + exit 1 + fi + done + if [ $# -ne 1 ]; then + echo "Usage: $0 config_file" + exit 1 + fi + # \"optimizer\": { + # \"type\": \"AdamW\", + # \"params\": { + # \"lr\": ${LR}, + # \"beta1\": 0.9, + # \"beta2\": 0.95, + # \"eps\": 1e-5, + # \"weight_decay\": 1e-1 + # } + # }, + # \"scheduler\": { + # \"type\": \"WarmupLR\", + # \"params\": { + # \"warmup_min_lr\": 0.00003, + # \"warmup_max_lr\": 0.0003, + # \"warmup_num_steps\": 5000 + # } + # }, + extra="" + common="\ + \"train_batch_size\": $GLOBAL_BATCH, + \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, + \"steps_per_print\": 1, + \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, + \"zero_allow_untested_optimizer\": true, + \"gradient_clipping\": 1.0, + \"activation_checkpointing\": { + \"partition_activations\": true, + \"contiguous_memory_optimization\": false + }, + \"wall_clock_breakdown\": false," + flops_profiler="\ + \"flops_profiler\": { + \"enabled\": true, + \"profile_step\": 4, + \"module_depth\": -1, + \"top_modules\": 1, + \"detailed\": true, + \"output_file\": null + }" + if [[ $DTYPE == "bf16" ]]; then + dtype="\ + 
\"communication_data_type\": \"bf16\", + \"fp16\": { + \"enabled\": false, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": true, + \"loss_scale\": 1.0 + }," + elif [[ $DTYPE == "fp16" ]]; then + dtype="\ + \"communication_data_type\": \"fp16\", + \"fp16\": { + \"enabled\": true, + \"loss_scale\": 0, + \"loss_scale_window\": 1000, + \"hysteresis\": 2, + \"min_loss_scale\": 1 + }, + \"bfloat16\": { + \"enabled\": false, + \"loss_scale\": 1.0 + }," + else + dtype="\"communication_data_type\": \"fp32\"," + fi + if [ $ZERO_STAGE == 3 ]; then + zero="\ + \"zero_optimization\": { + \"stage\": 3, + \"reduce_scatter\": false, + \"mics_shard_size\": 4, + \"mics_hierarchical_params_gather\": true, + \"stage3_max_live_parameters\": 3e9, + \"stage3_max_reuse_distance\": 3e9, + \"stage3_param_persistence_threshold\": 1e5, + \"stage3_prefetch_bucket_size\": 5e7, + \"contiguous_gradients\": true, + \"overlap_comm\": true, + \"reduce_bucket_size\": 90000000, + \"sub_group_size\": 1e9, + \"offload_optimizer\": { + \"device\": \"none\", + \"buffer_count\": 4, + \"pipeline_read\": false, + \"pipeline_write\": false, + \"pin_memory\": true + } + }," + # elif [[ $ZERO_STAGE == 2 ]]; then + elif [ "${ZERO_STAGE}" == 2 ] || [ "${ZERO_STAGE}" == 1 ]; then + if [[ -n "${CPU_OPTIMIZER}" ]]; then + echo "!!!! CAUGHT CPU_OPTIMIZER !!!!" 
+ zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE, + \"offload_optimizer\": { + \"device\": \"cpu\" + } + }," + else + zero="\ + \"zero_optimization\": { + \"stage\": $ZERO_STAGE + }," + fi + # elif [[ $ZERO_STAGE == 1 ]]; then + if [[ $PP > 1 ]]; then + extra="\ + \"data_types\": { + \"grad_accum_dtype\": \"fp32\" + }, + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," + else + # echo 'please add the config for zero_stage 1 without pipeline-parallelism' + extra="\ + \"comms_logger\": { + \"enabled\": true, + \"verbose\": false, + \"prof_all\": true, + \"debug\": false + }," + fi + else + echo 'Please add the correct config set!!!' + fi +# flops_profiler must at the end because no ',' is allowed at the end +cat < $1 +{ +$common +$zero +$dtype +$extra +$flops_profiler +} +EOT +} + printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" } diff --git a/generate_config.sh b/generate_config.sh deleted file mode 100644 index b164b5e610..0000000000 --- a/generate_config.sh +++ /dev/null @@ -1,172 +0,0 @@ -#!/bin/bash --login - -for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ - "$PP" "$DTYPE" -do - if [ -z $v ]; then - echo "Please export required envs before execute $0" - exit 1 - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 config_file" - exit 1 -fi - -# \"optimizer\": { -# \"type\": \"AdamW\", -# \"params\": { -# \"lr\": ${LR}, -# \"beta1\": 0.9, -# \"beta2\": 0.95, -# \"eps\": 1e-5, -# \"weight_decay\": 1e-1 -# } -# }, -# \"scheduler\": { -# \"type\": \"WarmupLR\", -# \"params\": { -# \"warmup_min_lr\": 0.00003, -# \"warmup_max_lr\": 0.0003, -# \"warmup_num_steps\": 5000 -# } -# }, - -extra="" -common="\ - \"train_batch_size\": $GLOBAL_BATCH, - \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, - \"steps_per_print\": 1, - \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, - \"zero_allow_untested_optimizer\": true, - \"gradient_clipping\": 1.0, - \"activation_checkpointing\": 
{ - \"partition_activations\": true, - \"contiguous_memory_optimization\": false - }, - \"wall_clock_breakdown\": false," - -flops_profiler="\ - \"flops_profiler\": { - \"enabled\": true, - \"profile_step\": 4, - \"module_depth\": -1, - \"top_modules\": 1, - \"detailed\": true, - \"output_file\": null - }" - -if [[ $DTYPE == "bf16" ]]; then -dtype="\ - \"communication_data_type\": \"bfp16\", - \"fp16\": { - \"enabled\": false, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": true, - \"loss_scale\": 1.0 - }," -elif [[ $DTYPE == "fp16" ]]; then -dtype="\ - \"communication_data_type\": \"fp16\", - \"fp16\": { - \"enabled\": true, - \"loss_scale\": 0, - \"loss_scale_window\": 1000, - \"hysteresis\": 2, - \"min_loss_scale\": 1 - }, - \"bfloat16\": { - \"enabled\": false, - \"loss_scale\": 1.0 - }," -else - dtype="\"communication_data_type\": \"fp32\"," -fi - -if [ $ZERO_STAGE == 3 ]; then -zero="\ - \"zero_optimization\": { - \"stage\": 3, - \"reduce_scatter\": false, - \"mics_shard_size\": 4, - \"mics_hierarchical_params_gather\": true, - \"stage3_max_live_parameters\": 3e9, - \"stage3_max_reuse_distance\": 3e9, - \"stage3_param_persistence_threshold\": 1e5, - \"stage3_prefetch_bucket_size\": 5e7, - \"contiguous_gradients\": true, - \"overlap_comm\": true, - \"reduce_bucket_size\": 90000000, - \"sub_group_size\": 1e9, - \"offload_optimizer\": { - \"device\": \"none\", - \"buffer_count\": 4, - \"pipeline_read\": false, - \"pipeline_write\": false, - \"pin_memory\": true - } - }," - -# elif [[ $ZERO_STAGE == 2 ]]; then -elif [ "${ZERO_STAGE}" == 2 ] || [ "${ZERO_STAGE}" == 1 ]; then - -if [[ -n "${CPU_OPTIMIZER}" ]]; then -echo "!!!! CAUGHT CPU_OPTIMIZER !!!!" 
- -zero="\ - \"zero_optimization\": { - \"stage\": $ZERO_STAGE, - \"offload_optimizer\": { - \"device\": \"cpu\" - } - }," - -else -zero="\ - \"zero_optimization\": { - \"stage\": $ZERO_STAGE - }," -fi - -# elif [[ $ZERO_STAGE == 1 ]]; then -if [[ $PP > 1 ]]; then - extra="\ - \"data_types\": { - \"grad_accum_dtype\": \"fp32\" - }, - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," -else - # echo 'please add the config for zero_stage 1 without pipeline-parallelism' - extra="\ - \"comms_logger\": { - \"enabled\": true, - \"verbose\": false, - \"prof_all\": true, - \"debug\": false - }," -fi -else - echo 'Please add the correct config set!!!' -fi - -# flops_profiler must at the end because no ',' is allowed at the end -cat < $1 -{ -$common -$zero -$dtype -$extra -$flops_profiler -} -EOT From 3be7efc120849627404da4607f20103089532064 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 23 Apr 2024 16:25:46 -0500 Subject: [PATCH 195/268] Add option to launch with `mpiexec` --- ALCF/helpers.sh | 2 +- pretrain_gpt_alcf.py | 27 ++++++++++++++++----------- train_llama_alcf.sh | 32 ++++++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 18fe965f82..be04c08201 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -9,7 +9,7 @@ else fi export WORKING_DIR="${WORKING_DIR}" -printf "Using WORKING_DIR: %s" ${WORKING_DIR} +printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} printJobInfo() { diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 0139330277..6530340c19 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -39,20 +39,19 @@ import ezpz as ez -# ---- SETUP DISTRIBUTED COMMS ---- -# RANK = ez.setup_torch( -# backend='deepspeed', -# port='5432', -# ) -RANK = ez.get_rank() +# ---- [SETUP COMMS] ------------------------ +RANK = ez.setup_torch(backend="deepspeed") +# RANK = ez.get_rank() WORLD_SIZE = ez.get_world_size() 
+LOCAL_RANK = ez.get_local_rank() DEVICE = ez.get_torch_device() - -# --- TURN OFF LOGGER ON ALL RANK != 0 ---- +if torch.cuda.is_available(): + torch.cuda.set_device(LOCAL_RANK) +# ------------------------------------------- +# --- [TURN OFF LOGGER ON ALL RANK != 0] ---- log = get_logger(__name__) log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") - -# ---- SETUP WANDB FROM RANK 0 ---------------- +# ---- [SETUP WANDB FROM RANK 0] -------------- WANDB_MODE = os.environ.get('WANDB_MODE', None) DISABLE_WANDB = ( WANDB_MODE is not None and str(WANDB_MODE).lower() == 'disabled' @@ -70,7 +69,7 @@ print('--------------------------------------------------') print(f"Setting up W&B from: {RANK} with {project_name}") print('--------------------------------------------------') - ez.setup_wandb(project_name=project_name) + _ = ez.setup_wandb(project_name=project_name) def model_provider(pre_process=True, post_process=True): @@ -163,6 +162,12 @@ def model_provider(pre_process=True, post_process=True): print_rank_0(80 * '-') see_memory_usage("After Building Model", force=True) if wandb.run is not None: + tbdir = args.tensorboard_dir + # tbdir = args.getattr('tensorboard_dir', None) + if tbdir is not None: + log.info(f'Patching tensorboard from {tbdir}') + wandb.tensorboard.patch(root_logdir=tbdir) + wandb.run.config.update({'num_params': num_params}) if "args" not in wandb.run.config: log.info( diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index ce18842850..09b175b1cd 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -29,7 +29,11 @@ sourceFile "${HERE}/ALCF/helpers.sh" || exit setEnv || exit # 1. load `conda` environment saveDSenv || exit # 2. save env vars to `.deepspeed_env` ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +if [[ -z "${HOSTFILE}" ]]; then + makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +else + echo "!! 
USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" +fi setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} @@ -65,10 +69,30 @@ mkdir -p "${TBDIR}" # --num-workers 0 \ # aprun -n "${NGPUS}" -N "${NGPU_PER_HOST}" --pmi=pmix ${PBS_O_WORKDIR}/local_rank.sh - # ${DIST_LAUNCH} $(which python3) ${EXEC} \ # yeet="${DIST_LAUNCH} ./local_rank.sh" + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + # +# export MPICH_GPU_SUPPORT_ENABLED=1 +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO + # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + # +data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" +mkdir -p "${data_cache_path}" + +if [[ -n "${DIST_LAUNCH}" ]]; then + LAUNCHER="${DIST_LAUNCH} python3 ${EXEC}" +else + LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" +fi + +rstr=$(printRed "Launching with:") +gstr=$(printGreen " ${LAUNCHER}") +printf "%s %s\n" "${rstr}" "${gstr}" + + # ${DIST_LAUNCH} python3 ${EXEC} \ run_cmd=" - deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ + ${LAUNCHER} \ --$DTYPE \ --optimizer ${OPT} \ --split 100,0,0 \ @@ -103,7 +127,7 @@ run_cmd=" --global-batch-size ${GLOBAL_BATCH} \ --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${DATA_CACHE_PATH} \ + --data-cache-path ${data_cache_path} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ --tokenizer-model ${TOKENIZER_MODEL} \ ${LLAMA_ARGS} \ From a8a9a591d72f1177a78ca3639c1eebca9515eff8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 23 Apr 2024 23:01:56 -0500 Subject: [PATCH 196/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 09b175b1cd..2c5eff2410 100644 --- a/train_llama_alcf.sh +++ 
b/train_llama_alcf.sh @@ -20,11 +20,14 @@ function sourceFile() { cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') export HERE + # ----[1. Assert `./pretrain_gpt_alcf.py` exists:]----------------------------- export EXEC="${HERE}/pretrain_gpt_alcf.py" [ -f "${EXEC}" ] || exit + # ----[2. `source ./ALCF/helpers_alcf.sh`:]------------------------------------ sourceFile "${HERE}/ALCF/helpers.sh" || exit + # ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ setEnv || exit # 1. load `conda` environment saveDSenv || exit # 2. save env vars to `.deepspeed_env` @@ -81,16 +84,20 @@ data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" mkdir -p "${data_cache_path}" if [[ -n "${DIST_LAUNCH}" ]]; then - LAUNCHER="${DIST_LAUNCH} python3 ${EXEC}" + LAUNCHER="${DIST_LAUNCH} python3 -Wignore ${EXEC}" else LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" fi -rstr=$(printRed "Launching with:") -gstr=$(printGreen " ${LAUNCHER}") -printf "%s %s\n" "${rstr}" "${gstr}" +rstr=$(printRed "Launching with:\n") +mstr=$(printBlue " ${LAUNCHER}") +printf "%s" "${rstr}'" +printf " %s" "${mstr}" +# printf "%s %s\n" "${rstr}" "${mstr}" # ${DIST_LAUNCH} python3 ${EXEC} \ + # --log-num-zeros-in-grad \ + # --log-memory-to-tensorboard \ run_cmd=" ${LAUNCHER} \ --$DTYPE \ @@ -130,6 +137,9 @@ run_cmd=" --data-cache-path ${data_cache_path} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ --tokenizer-model ${TOKENIZER_MODEL} \ + --timing-log-level ${TIMING_LOG_LEVEL} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ ${LLAMA_ARGS} \ $ds_args \ ${gpt_args[*]} \ From 42140d7bd0af08ff718db69c10b34c8af8ba039c Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 23 Apr 2024 23:02:24 -0500 Subject: [PATCH 197/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index be04c08201..06fadf74f3 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ 
-117,6 +117,7 @@ setParams() { export TRAIN_ITER=${TRAIN_ITER:-317892} export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" export SAVE_INTERVAL=${SAVE_INTERVAL:-200} + export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" From fa0c5a66272a2dbb3295a0540c7c544e9e334640 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 13:06:48 -0500 Subject: [PATCH 198/268] Update `ALCF/helpers.sh`, `train_llama_alcf.sh` --- ALCF/helpers.sh | 57 +++++++++++++++++++++++++++++------------ train_llama_alcf.sh | 62 +++++++++++++++------------------------------ 2 files changed, 61 insertions(+), 58 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 06fadf74f3..8e9e508087 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -36,6 +36,19 @@ function setupSrun() { } +function setupLauncher() { + # outdir=$1 + if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then + export LAUNCH_CMD="${DIST_LAUNCH} python3 -Wignore ${EXEC}" + else + # Assert `./hostfile_deepspeed` exists + export hfds="${WORKING_DIR}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit + export LAUNCH_CMD="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" + fi + printf "%s" "$(printRed 'Launching with:')" + printf " %s" "$(printMagenta ${LAUNCH_CMD})" +} + function setDSlauncher() { # launcher setting outdir=$1 @@ -159,14 +172,27 @@ setArgs() { export gpt_args } + +function make_ds_hostfile() { + export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" + # ---- Make MPICH hostfile ---------------- + hf="${HOSTFILE:-${PBS_NODEFILE}}" + export hostfile_mpich=hostfile_mpich + cat "${hf}" > "${hostfile_mpich}" + # ---- Make DeepSpeed hostfile ------------------- + export hostfile_deepspeed=hostfile_deepspeed + cat "${hf}" > 
"${hostfile_deepspeed}" + sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" +} + # +---------------------------------------+ # | 1. Git clone ezpz (if not found) | # | 2. Install ezpz (if not installed) | # +---------------------------------------+ ezpz() { - if [[ ! -d "${PBS_O_WORKDIR}/deps/ezpz" ]]; then - mkdir -p "${PBS_O_WORKDIR}/deps" - git clone https://github.com/saforem2/ezpz "${PBS_O_WORKDIR}/deps" + if [[ ! -d "${WORKING_DIR}/deps/ezpz" ]]; then + mkdir -p "${WORKING_DIR}/deps" + git clone https://github.com/saforem2/ezpz "${WORKING_DIR}/deps" else echo "Found ezpz!" fi @@ -175,11 +201,12 @@ ezpz() { else echo "Does not have ezpz installed. Installing..." echo "Using $(which python3) to install ezpz:" - python3 -m pip install -e "${PBS_O_WORKDIR}/edps/ezpz" # > ezpz-install.log 2>&1 + python3 -m pip install -e "${WORKING_DIR}/edps/ezpz" # > ezpz-install.log 2>&1 fi echo "Done with ezpz." source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" > /tmp/savejobenv.log 2>&1 || exit source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit + make_ds_hostfile || exit } # +------------------------------------------------------------------------+ @@ -201,7 +228,8 @@ saveDSenv() { setOutput() { # ---- Specify output location -------------------------------- export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" - OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" @@ -275,18 +303,15 @@ setEnv() { echo "[python] Using: $(which python3)" } + makeHostfiles() { - # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" || exit #> /tmp/savejobenv.log 2>&1 
& - # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit - export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" - # ---- Make MPICH hostfile ---------------- - hf="${HOSTFILE:-${PBS_NODEFILE}}" - export hostfile_mpich=hostfile_mpich - cat "${hf}" > "${hostfile_mpich}" - # ---- Make DeepSpeed hostfile ------------------- - export hostfile_deepspeed=hostfile_deepspeed - cat "${hf}" > "${hostfile_deepspeed}" - sed -e "s/$/ slots=${GPUS_PER_NODE}/" -i "${hostfile_deepspeed}" + if [[ -n "${HOSTFILE}" ]]; then + printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" + else + make_ds_hostfile + # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" || exit #> /tmp/savejobenv.log 2>&1 & + # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit + fi } setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 2c5eff2410..af689ab235 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -30,29 +30,24 @@ sourceFile "${HERE}/ALCF/helpers.sh" || exit # ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ setEnv || exit # 1. load `conda` environment -saveDSenv || exit # 2. save env vars to `.deepspeed_env` +# saveDSenv || exit # 2. save env vars to `.deepspeed_env` ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -if [[ -z "${HOSTFILE}" ]]; then - makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -else - echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" -fi + +# if [[ -z "${HOSTFILE}" ]]; then +# makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +# else +# echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" +# fi setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. 
specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset -setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +# setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` printJobInfo || exit # 11. print job info +setupLauncher || exit # ----------------------------------------------------------------------------- -# Take custom args -custom_args=" $@" - -# Assert `./hostfile_deepspeed` exists -export hfds="${HERE}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit -TBDIR="${CKPT_DIR}/tensorboard" -mkdir -p "${TBDIR}" # TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())') # printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}" @@ -63,44 +58,27 @@ mkdir -p "${TBDIR}" # fi -# source "${HERE}/venvs/polaris/2024-03-14/bin/activate" || exit -# echo "Using $(which python3)" -# --launcher_args='--pmi=pmix' -# deepspeed --hostfile $hfds --launcher ${LAUNCHER} ${EXEC} \ -# ${launch_cmd} \ -# --use-flash-attn-v2 \ -# --num-workers 0 \ - - # aprun -n "${NGPUS}" -N "${NGPU_PER_HOST}" --pmi=pmix ${PBS_O_WORKDIR}/local_rank.sh -# yeet="${DIST_LAUNCH} ./local_rank.sh" - # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - # # export MPICH_GPU_SUPPORT_ENABLED=1 # export CUDA_DEVICE_MAX_CONNECTIONS=1 # export NCCL_DEBUG=INFO - # deepspeed --hostfile $hfds --launcher MPICH ${EXEC} \ - # +# +# +# Assert TBDIR exists inside our $CKPT_DIR +TBDIR="${CKPT_DIR}/tensorboard" +mkdir -p "${TBDIR}" + data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" mkdir -p "${data_cache_path}" +module list -if [[ -n "${DIST_LAUNCH}" ]]; then - LAUNCHER="${DIST_LAUNCH} python3 -Wignore ${EXEC}" -else - LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" -fi - -rstr=$(printRed "Launching with:\n") -mstr=$(printBlue " ${LAUNCHER}") -printf "%s" 
"${rstr}'" -printf " %s" "${mstr}" -# printf "%s %s\n" "${rstr}" "${mstr}" +# Take custom args +custom_args=" $@" - # ${DIST_LAUNCH} python3 ${EXEC} \ # --log-num-zeros-in-grad \ # --log-memory-to-tensorboard \ run_cmd=" - ${LAUNCHER} \ - --$DTYPE \ + ${LAUNCH_CMD} \ + --${DTYPE} \ --optimizer ${OPT} \ --split 100,0,0 \ --log-interval 1 \ From 4b9c2f2c966664e0f67a4840b3d88ab6691c76d1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 13:11:22 -0500 Subject: [PATCH 199/268] Add `ALCF/sunspot-env.sh` --- ALCF/sunspot-env.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 ALCF/sunspot-env.sh diff --git a/ALCF/sunspot-env.sh b/ALCF/sunspot-env.sh new file mode 100644 index 0000000000..e97545aafe --- /dev/null +++ b/ALCF/sunspot-env.sh @@ -0,0 +1,27 @@ +#!/bin/bash --login + +export CCL_OP_SYNC=1 # Required by current oneCCL (HPCS-8067) +export CCL_PROCESS_LAUNCHER=pmix # Required by Aurora mpich +export FI_PROVIDER=cxi # Required by Aurora mpich +export PALS_PMI=pmix # Required by Aurora mpich +export CCL_ATL_TRANSPORT=mpi # Required by Aurora mpich +export FI_MR_CACHE_MONITOR=disabled # Required by Aurora mpich (HPCS-6501) +export CCL_SKIP_SCHEDULER=1 # Required by current oneCCL, will remove when set by default +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export CCL_TOPO_COLOR="card:{0,1},{2,3},{4,5},{6,7},{8,9},{10,11};plane:{0,3,4,6,8,11},{1,2,5,7,9,10}" +export UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0 # Required by current oneCCL + + +export LLM_DK_DIR=/home/$(whoami)/q4-drop_sunspot/llm.devkit + +module load oneapi/release/2023.12.15.001 +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + + +module use /home/ftartagl/graphics-compute-runtime/modulefiles +module load graphics-compute-runtime/agama-ci-devel-736.9 +source 
/home/$(whoami)/q4-drop_sunspot/llm.devkit/torch-ccl/third_party/oneCCL/build/_install/env/vars.sh +module load gcc/12.1.0 +module unload intel_compute_runtime/release/agama-devel-647 From c2e9147546e03cfdb0afe85e6ed8317eecd7ea66 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 13:34:10 -0500 Subject: [PATCH 200/268] Update `train_llama_alcf.sh`, `ALCF/helpers.sh` --- ALCF/helpers.sh | 53 ++++++++++++++++++++++++++++----------------- train_llama_alcf.sh | 5 ++++- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 8e9e508087..2e34e6c425 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -1,11 +1,15 @@ #!/bin/bash --login +# +# set -euxo pipefail if [[ -n "${PBS_O_WORKDIR}" ]]; then WORKING_DIR="${PBS_O_WORKDIR}" elif [[ -n "${SLURM_SUBMIT_DIR}" ]]; then WORKING_DIR="${SLURM_SUBMIT_DIR}" else - WORKING_DIR="$(realpath $(pwd))" + echo "Unable to detect PBS or SLURM working directory info..." + WORKING_DIR=$(python3 -c 'import os; print(os.getcwd())') + echo "Using ${WORKING_DIR} as working directory..." 
fi export WORKING_DIR="${WORKING_DIR}" @@ -14,7 +18,7 @@ printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" - echo "- MPICH_DIR=$MPICH_DIR" + echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}" echo "- Using $(which python3)" echo "- WORLD_SIZE:${WORLD_SIZE}" echo "- NCCL: ${NCCL:-nccl}" @@ -68,6 +72,7 @@ setParams() { LLAMA_ARGS="" # ---- [Parallelism Settings] -------------------------------------------- # -------- [Aurora] ---- || ----- [SunSpot] ------------ + NO_FLASH_ATTN="${NO_FLASH_ATTN:-0}" if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then TP=${TP:-1} # TP = 1 export CCL=${CCL:-ccl} # CCL @@ -75,7 +80,7 @@ setParams() { export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 # export WORKING_DIR="${PBS_O_WORKDIR}" - if [[ -z "${NO_FLASH_ATTN}" ]]; then + if [[ "${NO_FLASH_ATTN}" != 0 ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn" fi # -------- [Polaris] ----------------------------------- @@ -87,7 +92,8 @@ setParams() { export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 # export WORKING_DIR="${PBS_O_WORKDIR}" - if [[ -z "${NO_FLASH_ATTN}" ]]; then + # if [[ -z "${NO_FLASH_ATTN}" ]]; then + if [[ "${NO_FLASH_ATTN}" != 0 ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi # -------- [Perlmutter] --------------------------------- @@ -98,7 +104,8 @@ setParams() { export DTYPE="${DTYPE:-bf16}" MICRO_BATCH="${MICRO_BATCH:-8}" # export WORKING_DIR="${SLURM_SUBMIT_DIR}" - if [[ -z "${NO_FLASH_ATTN}" ]]; then + # if [[ -z "${NO_FLASH_ATTN}" ]]; then + if [[ "${NO_FLASH_ATTN}" != 0 ]]; then LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi fi @@ -192,20 +199,21 @@ function make_ds_hostfile() { ezpz() { if [[ ! 
-d "${WORKING_DIR}/deps/ezpz" ]]; then mkdir -p "${WORKING_DIR}/deps" - git clone https://github.com/saforem2/ezpz "${WORKING_DIR}/deps" + git clone https://github.com/saforem2/ezpz "${WORKING_DIR}/deps/ezpz" else echo "Found ezpz!" fi + echo "Done with clone. Now, checking if ezpz is installed..." if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then echo "Has ezpz installed. Nothing to do." else echo "Does not have ezpz installed. Installing..." echo "Using $(which python3) to install ezpz:" - python3 -m pip install -e "${WORKING_DIR}/edps/ezpz" # > ezpz-install.log 2>&1 + python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" # > ezpz-install.log 2>&1 fi echo "Done with ezpz." - source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" > /tmp/savejobenv.log 2>&1 || exit - source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit + source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv # > /dev/null 2>&1 #> /tmp/savejobenv.log 2>&1 || exit + source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv || exit make_ds_hostfile || exit } @@ -240,10 +248,11 @@ setOutput() { buildDSconfig() { # ---- Build DeepSpeed Config --------------------------------- + export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}" export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" mkdir -p $(dirname "${DS_CONFIG}") echo "DS_CONFIG: ${DS_CONFIG}" - printf "ZS: %s, CPU_OPTIMIZER: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" + printf "ZS: %s, , MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${CPU_OPTIMIZER}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}" # working_dir="${PBS_O_WORKDIR:-${SLURM_SUBMIT_DIR:-$(pwd)}}" generateDSconfig "${DS_CONFIG}" # bash "${WORKING_DIR}/ALCF/generate_ds_config.sh" "${DS_CONFIG}" @@ -269,16 +278,19 @@ sumFiles() { setEnv() { # ---- [SunSpot] ------- || ---- [Aurora] -------------- if [[ 
$(hostname) == x1* || $(hostname) == x4* ]]; then - PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) - echo "Sourcing ${PBS_PARENT}/setenv.sh..." - source "${PBS_PARENT}/setenv.sh" || exit + # PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) + # echo "Sourcing ${PBS_PARENT}/setenv.sh..." + # source "${PBS_PARENT}/setenv.sh" || exit + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit # ----- [Aurora] ----------------------------------- - if [[ $(hostname) == x4* ]]; then - eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate anl_release_q4v2 - # ----- [SunSpot] ---------------------------------- - elif [[ $(hostname) == x1* ]]; then - echo "Running on SunSpot !!" - eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" + eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi fi # ----- [Polaris] --------------------------------------- elif [[ $(hostname) == x3* ]]; then @@ -460,7 +472,8 @@ generateDSconfig() { }," # elif [[ $ZERO_STAGE == 2 ]]; then elif [ "${ZERO_STAGE}" == 2 ] || [ "${ZERO_STAGE}" == 1 ]; then - if [[ -n "${CPU_OPTIMIZER}" ]]; then + # if [[ -n "${CPU_OPTIMIZER}" ]]; then + if [[ "${CPU_OPTIMIZER}" != 0 ]]; then echo "!!!! CAUGHT CPU_OPTIMIZER !!!!" zero="\ \"zero_optimization\": { diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index af689ab235..7731df52dd 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -5,6 +5,8 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home +set -euxo pipefail + function sourceFile() { fp="$1" echo "source-ing ${fp}" @@ -125,7 +127,8 @@ run_cmd=" |& tee ${OUTPUT_LOG} " -echo "! Using $(which deepspeed)" +# ds_exec +# echo "! 
Using $(which deepspeed)" ds_report echo "${run_cmd}" From 41a3f3523a29a18462d926bbec5cd86c278bfedf Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 14:10:04 -0500 Subject: [PATCH 201/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 2e34e6c425..d0716739af 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -43,7 +43,7 @@ function setupSrun() { function setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then - export LAUNCH_CMD="${DIST_LAUNCH} python3 -Wignore ${EXEC}" + export LAUNCH_CMD="${DIST_LAUNCH} --cpu-bind depth -d 16 python3 -Wignore ${EXEC}" else # Assert `./hostfile_deepspeed` exists export hfds="${WORKING_DIR}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit From 71c725ec9aab266d25963fa0d8a37d15c582d7f2 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 14:17:00 -0500 Subject: [PATCH 202/268] Much faster check if `ezpz` installed Replace: ```bash if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then ``` with ```bash if python3 -c "import sys; any(['ezpz' in s for s in sys.path])" 2> '/dev/null'; then ``` in `ezpz()` from `ALCF/helpers.sh` --- ALCF/helpers.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index d0716739af..a097d72efc 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -204,7 +204,8 @@ ezpz() { echo "Found ezpz!" fi echo "Done with clone. Now, checking if ezpz is installed..." - if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + # if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + if python3 -c "import sys; any(['ezpz' in s for s in sys.path])" 2> '/dev/null'; then echo "Has ezpz installed. Nothing to do." else echo "Does not have ezpz installed. Installing..." 
From ae0b4d8b40aec3d43c1842442166829463563d15 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 15:30:50 -0500 Subject: [PATCH 203/268] Add option to run in `DEBUG` mode (i.e. `set -euxo pipefail`) --- ALCF/helpers.sh | 84 ++++++++++++++++++++++++--------------------- train_llama_alcf.sh | 5 ++- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index a097d72efc..fdd42b56b8 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -28,7 +28,7 @@ printJobInfo() { } -function setupSrun() { +setupSrun() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then export NHOSTS="${SLURM_NNODES:-1}" export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" @@ -40,7 +40,7 @@ function setupSrun() { } -function setupLauncher() { +setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then export LAUNCH_CMD="${DIST_LAUNCH} --cpu-bind depth -d 16 python3 -Wignore ${EXEC}" @@ -53,7 +53,7 @@ function setupLauncher() { printf " %s" "$(printMagenta ${LAUNCH_CMD})" } -function setDSlauncher() { +setDSlauncher() { # launcher setting outdir=$1 export hfds="$outdir/hostfile_deepspeed" @@ -70,20 +70,23 @@ function setDSlauncher() { setParams() { LLAMA_ARGS="" - # ---- [Parallelism Settings] -------------------------------------------- - # -------- [Aurora] ---- || ----- [SunSpot] ------------ - NO_FLASH_ATTN="${NO_FLASH_ATTN:-0}" + # +----[Parallelism Settings] -------------------------------------------+ + # +------[Aurora]--------||-------[SunSpot]-------------+ if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then TP=${TP:-1} # TP = 1 export CCL=${CCL:-ccl} # CCL export BE="${CCL}" # BE = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 - # export WORKING_DIR="${PBS_O_WORKDIR}" - if [[ "${NO_FLASH_ATTN}" != 0 ]]; then - LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn" + 
####################################################### + # if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else + LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-builder" fi - # -------- [Polaris] ----------------------------------- + ####################################################### + # +--------[Polaris]-----------------------------------+ elif [[ $(hostname) == x3* ]]; then TP=${TP:-2} # TP = 2 export NCCL=${NCCL:-nccl} # NCCL @@ -91,61 +94,64 @@ setParams() { # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 - # export WORKING_DIR="${PBS_O_WORKDIR}" - # if [[ -z "${NO_FLASH_ATTN}" ]]; then - if [[ "${NO_FLASH_ATTN}" != 0 ]]; then + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" + else LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi - # -------- [Perlmutter] --------------------------------- + # +--------[Perlmutter]---------------------------------+ elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then TP="${TP:-2}" export NCCL="${NCCL:-nccl}" export BE="${NCCL}" export DTYPE="${DTYPE:-bf16}" MICRO_BATCH="${MICRO_BATCH:-8}" - # export WORKING_DIR="${SLURM_SUBMIT_DIR}" - # if [[ -z "${NO_FLASH_ATTN}" ]]; then - if [[ "${NO_FLASH_ATTN}" != 0 ]]; then + if [[ -n "${NO_FLASH_ATTN-}" ]]; then + echo "Not using flash-attn!!" 
+ else LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi fi - # ------------------------------------------------------------------------ + # +----------------------------------------------------------------------+ export TP="${TP}" export PP="${PP:-1}" export DTYPE="${DTYPE:-bf16}" export OPT="${OPT:-adamw}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" NHOSTS=$(wc -l < "${HOSTFILE}") - NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') + if [[ -z "${NGPU_PER_HOST-}" ]]; then + NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') + fi export WORLD_SIZE="${WORLD_SIZE:-$(( NHOSTS * NGPU_PER_HOST ))}" # export WORLD_SIZE="${WORLD_SIZE:-${NGPUS:-$(( ))}}" # export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} - # ---- Llama2 7B Config ------------------------------ + # +---[Llama2 7B Config]-----------------------------+ export MODEL_KEY="Llama-7B" export HEADS=${HEADS:-${NHEADS:-32}} export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} export HIDDEN=${HIDDEN:-4096} export NUM_KV_HEAD=${NUM_KV_HEAD:-8} export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} - # ---- Run Settings ---------------------------------- - export LR=${LR:-0.0003} + # +---[Run Settings]------------------------------------------------------+ + export LR=${LR:-0.0003} # LEARNING_RATE export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 - export ZERO_STAGE=${ZERO_STAGE:-2} - export MICRO_BATCH=${MICRO_BATCH:-8} - export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} - export EVAL_ITERS="${EVAL_ITERS:-10}" - export TRAIN_ITER=${TRAIN_ITER:-317892} - export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" - export SAVE_INTERVAL=${SAVE_INTERVAL:-200} - export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" - export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} - export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) - export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" - tm="${WORKING_DIR}/ALCF/tokenizer.model" - export 
TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" - export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" + export ZERO_STAGE=${ZERO_STAGE:-2} # ZERO OFFLOADING STAGE + export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS + export EVAL_ITERS="${EVAL_ITERS:-10}" # NUMBER OF EVAL ITERS TO RUN + export TRAIN_ITER=${TRAIN_ITER:-317892} # NUMBER OF TRAIN ITERS + export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" # HOW FREQUENTLY TO RUN EVAL + export SAVE_INTERVAL=${SAVE_INTERVAL:-200} # HOW FREQUENTLY TO SAVE CKPTS + export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS + export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # USE ACTIVATION CHECKPOINTING ? + export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # MAX GLOBAL BATCH SIZE + export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT + tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL + # +----[ADDITIONAL LLAMA SPECIFIC ARGUMENTS]------------------------------ export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - # ---------------------------------------------------- + # +----------------------------------------------------------------------+ } @@ -180,7 +186,7 @@ setArgs() { } -function make_ds_hostfile() { +make_ds_hostfile() { export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" # ---- Make MPICH hostfile 
---------------- hf="${HOSTFILE:-${PBS_NODEFILE}}" @@ -213,7 +219,7 @@ ezpz() { python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" # > ezpz-install.log 2>&1 fi echo "Done with ezpz." - source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv # > /dev/null 2>&1 #> /tmp/savejobenv.log 2>&1 || exit + source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv > /dev/null 2>&1 #> /tmp/savejobenv.log 2>&1 || exit source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv || exit make_ds_hostfile || exit } diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 7731df52dd..4aac1153c7 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -5,7 +5,10 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home -set -euxo pipefail +if [[ -n "${DEBUG-}" ]]; then + printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" + set -euxo pipefail +fi function sourceFile() { fp="$1" From 2d6608a9260cf36ac844d87efd7a4e73b08398de Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 17:00:23 -0500 Subject: [PATCH 204/268] Update `ALCF/data-lists/sunspot/*.txt` --- ALCF/data-lists/sunspot/algebraic.txt | 16 + ALCF/data-lists/sunspot/arxiv.txt | 100 + ALCF/data-lists/sunspot/books.txt | 3 + ALCF/data-lists/sunspot/c4.txt | 171 + ALCF/data-lists/sunspot/cc.txt | 1108 +++++++ .../sunspot/data_file_list_books.txt | 3 - ALCF/data-lists/sunspot/data_file_list_c4.txt | 86 - ALCF/data-lists/sunspot/data_file_list_cc.txt | 2880 ----------------- .../sunspot/data_file_list_peS2o.txt | 26 - .../sunspot/data_file_list_reddit.txt | 78 - .../sunspot/data_file_list_stack.txt | 149 - .../sunspot/data_file_list_wiki.txt | 2 - ALCF/data-lists/sunspot/falcon.txt | 501 +++ ALCF/data-lists/sunspot/megawiki.txt | 262 ++ .../sunspot/open-web-math-train.txt | 13 + ALCF/data-lists/sunspot/pes2o.txt | 26 + ALCF/data-lists/sunspot/reddit.txt | 78 + ALCF/data-lists/sunspot/stack.txt | 26 + ALCF/data-lists/sunspot/starcoder.txt | 50 + ALCF/data-lists/sunspot/tulu.txt | 66 + 
ALCF/data-lists/sunspot/wiki.txt | 2 + 21 files changed, 2422 insertions(+), 3224 deletions(-) create mode 100644 ALCF/data-lists/sunspot/algebraic.txt create mode 100644 ALCF/data-lists/sunspot/arxiv.txt create mode 100644 ALCF/data-lists/sunspot/books.txt create mode 100644 ALCF/data-lists/sunspot/c4.txt create mode 100644 ALCF/data-lists/sunspot/cc.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_books.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_c4.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_cc.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_peS2o.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_reddit.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_stack.txt delete mode 100644 ALCF/data-lists/sunspot/data_file_list_wiki.txt create mode 100644 ALCF/data-lists/sunspot/falcon.txt create mode 100644 ALCF/data-lists/sunspot/megawiki.txt create mode 100644 ALCF/data-lists/sunspot/open-web-math-train.txt create mode 100644 ALCF/data-lists/sunspot/pes2o.txt create mode 100644 ALCF/data-lists/sunspot/reddit.txt create mode 100644 ALCF/data-lists/sunspot/stack.txt create mode 100644 ALCF/data-lists/sunspot/starcoder.txt create mode 100644 ALCF/data-lists/sunspot/tulu.txt create mode 100644 ALCF/data-lists/sunspot/wiki.txt diff --git a/ALCF/data-lists/sunspot/algebraic.txt b/ALCF/data-lists/sunspot/algebraic.txt new file mode 100644 index 0000000000..0f25e30498 --- /dev/null +++ b/ALCF/data-lists/sunspot/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document diff --git a/ALCF/data-lists/sunspot/arxiv.txt b/ALCF/data-lists/sunspot/arxiv.txt new file mode 100644 index 0000000000..c50df90503 --- /dev/null +++ b/ALCF/data-lists/sunspot/arxiv.txt @@ -0,0 +1,100 @@ 
+0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document diff --git a/ALCF/data-lists/sunspot/books.txt b/ALCF/data-lists/sunspot/books.txt new file mode 100644 index 0000000000..7aa37a00d2 --- /dev/null +++ b/ALCF/data-lists/sunspot/books.txt @@ -0,0 +1,3 @@ +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/c4.txt b/ALCF/data-lists/sunspot/c4.txt new file mode 100644 index 0000000000..9504bcbfe6 --- /dev/null +++ b/ALCF/data-lists/sunspot/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document diff --git a/ALCF/data-lists/sunspot/cc.txt b/ALCF/data-lists/sunspot/cc.txt new file mode 100644 index 0000000000..0a2a0ce35b --- /dev/null +++ b/ALCF/data-lists/sunspot/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document + diff --git a/ALCF/data-lists/sunspot/data_file_list_books.txt b/ALCF/data-lists/sunspot/data_file_list_books.txt deleted file mode 100644 index 9187565a5e..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_books.txt +++ /dev/null @@ -1,3 +0,0 @@ -0.0031007020167215667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0000_text_document -0.003100207465277759 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0001_text_document -0.000999090518000674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_c4.txt b/ALCF/data-lists/sunspot/data_file_list_c4.txt deleted file mode 100644 index ca7df1839e..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_c4.txt +++ /dev/null @@ -1,86 +0,0 @@ -0.0011545953050729803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0000_text_document -0.0011570295715413383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0001_text_document -0.001156438391210766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0002_text_document -0.0011556820995190797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0003_text_document -0.001156780334924253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0004_text_document -0.0011563528368937514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0005_text_document -0.0011574632716369762 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0006_text_document -0.0011577445131424494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0007_text_document -0.0011599182963630329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0008_text_document -0.0011550792360663698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0009_text_document -0.001154948574643344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0010_text_document -0.0011560157369398198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0011_text_document -0.0011551344387810997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0012_text_document -0.0011586914190552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0013_text_document -0.00115559584811127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0014_text_document -0.0011562917764239204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0015_text_document -0.0011582019252872318 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0016_text_document -0.0011585605528399534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0017_text_document -0.0011567600261132287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0018_text_document -0.0011561323235067436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0019_text_document -0.0011568948157687324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0020_text_document -0.0011562184926986983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0021_text_document -0.001155171968076667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0022_text_document -0.001156245876059478 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0023_text_document -0.0011591826911770261 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0024_text_document -0.0011564400126070828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0025_text_document -0.0011571005158517765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0026_text_document -0.0011560050453907214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0027_text_document -0.0011559074476966407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0028_text_document -0.0011567638698290205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0029_text_document -0.0011558972055942165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0030_text_document -0.001157532269673901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0031_text_document -0.0011559883017581377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0032_text_document -0.001155556362078353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0033_text_document -0.0011544735837522018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0034_text_document -0.0011547315955415466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0035_text_document -0.0011570980852521353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0036_text_document -0.0011562552591307868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0037_text_document -0.001156640315842092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0038_text_document -0.0011587257748187634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0039_text_document -0.0011563083526351268 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0040_text_document -0.0011554464046007336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0041_text_document -0.001155442922136426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0042_text_document -0.0011557081619451221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0043_text_document -0.001156421357082161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0044_text_document -0.0011562730825316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0045_text_document -0.001157525507046117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0046_text_document -0.0011552936629887162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0047_text_document -0.0011578959437852875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0048_text_document -0.0011568910557636293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0049_text_document -0.0011578444955946039 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0050_text_document -0.001157076096248001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0051_text_document -0.0011568459536403974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0052_text_document -0.0011555352450605598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0053_text_document -0.0011557650508322967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0054_text_document -0.0011567625802857914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0055_text_document -0.0011568533734967437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0056_text_document -0.0011562185375437102 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0057_text_document -0.0011558740426473278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0058_text_document -0.0011549825990520978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0059_text_document -0.0011572314079774744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0060_text_document -0.0011576031815962752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0061_text_document -0.0011567937670018521 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0062_text_document -0.001154956951193276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0063_text_document -0.001157226898064118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0064_text_document -0.001156096958730414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0065_text_document -0.001155844223704128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0066_text_document -0.0011571187084765205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0067_text_document -0.0011573954893981501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0068_text_document -0.0011566700251641518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0069_text_document -0.0011550051959552815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0070_text_document -0.0011559629359246125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0071_text_document -0.001157971629210032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0072_text_document -0.0011561725903411443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0073_text_document -0.001157160385935682 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0074_text_document -0.0011568864860569239 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0075_text_document -0.0011576433208715313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0076_text_document -0.0011571382379808948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0077_text_document -0.0011590178523739284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0078_text_document -0.001156347684201892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0079_text_document -0.0011552550374817486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0080_text_document -0.0011570794132840427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0081_text_document -0.0011570932061148482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0082_text_document -0.0011561938025300182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0083_text_document -0.0011560757016965283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0084_text_document -0.00019284851714729888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/c4-0085_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_cc.txt b/ALCF/data-lists/sunspot/data_file_list_cc.txt deleted file mode 100644 index 4b9a797878..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_cc.txt +++ /dev/null @@ -1,2880 +0,0 @@ -0.0002329030984435853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0000_text_document -0.00023018699207949078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0001_text_document -0.00024373839803694205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0002_text_document -0.00023608269234913788 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0003_text_document -0.00024813091225197464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0004_text_document -0.00023520818074126314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0005_text_document -0.0002374607329273171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0006_text_document -0.00023738412849923294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0007_text_document -0.0002443634316582533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0008_text_document -0.00023847622533166118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0009_text_document -0.00023199871587697545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0010_text_document -0.0002385337709567312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0011_text_document -0.0002432839071745339 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0012_text_document -0.00023508523674007346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0013_text_document -0.00032603226617680567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0014_text_document -0.00023789141182395846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0015_text_document -0.0002461407443245122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0016_text_document -0.00023499257215518966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0017_text_document -0.00024846537508068473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0018_text_document -0.0002386611981191132 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0019_text_document -0.0002476214516386151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0020_text_document -0.00023922963334203518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0021_text_document -0.0002566637890877035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0022_text_document -0.0002480836116312675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0023_text_document -0.0002500957846859012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0024_text_document -0.00023232303192858133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0025_text_document -0.0002402109920207785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0026_text_document -0.00032458741378655037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0027_text_document -0.00023711130623699136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0028_text_document -0.0002473092752915358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0029_text_document -0.00024517111812673547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0030_text_document -0.00024145261714879915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0031_text_document -0.0002441832095655324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0032_text_document -0.00024533720808111173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0033_text_document -0.00024615543201451354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0034_text_document -0.00029788578618284437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0035_text_document -0.00026821245945822444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0036_text_document -0.0002451138188102186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0037_text_document -0.00023812823651070536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0038_text_document -0.00023799603175215714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0039_text_document -0.00024128396884325748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0040_text_document -0.00024158008848876737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0041_text_document -0.00024722330373436316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0042_text_document -0.00023308404070500205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0043_text_document -0.0002554252556503107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0044_text_document -0.0003132025339147037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0045_text_document -0.00024278622445373792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0046_text_document -0.0003214585004572529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0047_text_document -0.0003329131703028111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0048_text_document -0.0002361664236831262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0049_text_document -0.0002643368247294079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0050_text_document -0.00024766538637149724 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0051_text_document -0.0002627167479901225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0052_text_document -0.00025033496855447236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0053_text_document -0.00024160037266449382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0054_text_document -0.00022926708072112655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0055_text_document -0.00023577632399723273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0056_text_document -0.00024916378421745264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0057_text_document -0.00024065956580145883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0058_text_document -0.00032914757231594763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0059_text_document -0.000382735213415281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0060_text_document -0.00019876415914729903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0061_text_document -0.0002455041228482986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0062_text_document -0.0002360975192355561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0063_text_document -0.00035687225557611647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0064_text_document -0.00034010734287544296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0065_text_document -0.00024289772720050695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0066_text_document -0.0002298464162081398 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0067_text_document -0.00032731880189343956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0068_text_document -0.00024593154050122983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0069_text_document -0.00024184757636917526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0070_text_document -0.0002619883069796127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0071_text_document -0.00023707630401459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0072_text_document -0.0003648802259322563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0073_text_document -0.00034821518419266554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0074_text_document -0.00025687739808269634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0075_text_document -0.00025210376457187776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0076_text_document -0.00025341417049958763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0077_text_document -0.00026096750660126574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0078_text_document -0.0002557323323244081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0079_text_document -0.0003306928457892949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0080_text_document -0.00034038835131844906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0081_text_document -0.00025944099107910257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0082_text_document -0.00011523229485833962 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0083_text_document -0.0002577986281049885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0084_text_document -0.00011763411767853355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0085_text_document -0.00025348268598695737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0086_text_document -0.00032333206004171266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0087_text_document -0.00030755087408648437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0088_text_document -0.00023006508933660387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0089_text_document -0.00023529378653763827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0090_text_document -0.0002316006671871909 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0091_text_document -0.0002467080329046101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0092_text_document -0.0002812385280195195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0093_text_document -0.0002999655363830447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0094_text_document -0.00030366253916544147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0095_text_document -0.00034483134052353947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0096_text_document -0.0002264669007084511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0097_text_document -0.0002601377797129039 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0098_text_document -0.000243683175313779 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0099_text_document -0.0002458323373867855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0100_text_document -0.00023061358738763293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0101_text_document -0.0002383240957413279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0102_text_document -0.00024652411741760106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0103_text_document -0.00024356064371899462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0104_text_document -0.00023826916720633669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0105_text_document -0.00023583636824734604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0106_text_document -0.00023310828235332517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0107_text_document -0.00024133699058477928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0108_text_document -0.00023757818755491814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0109_text_document -0.00024650642737935284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0110_text_document -0.00023587507176169633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0111_text_document -0.0002394516652010616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0112_text_document -0.00026115753562452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0113_text_document -0.00023919185015293048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0114_text_document -0.0002328737948830104 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0115_text_document -0.0002449581587150213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0116_text_document -0.00023488566807302266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0117_text_document -0.0002461692650286432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0118_text_document -0.00023193321359714746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0119_text_document -0.00024814319189332457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0120_text_document -0.0002502054369100928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0121_text_document -0.0002294119999864264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0122_text_document -0.00023986985689573848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0123_text_document -0.00023333209217509475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0124_text_document -0.0002268247786450586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0125_text_document -0.0002289098412617007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0126_text_document -0.00023635954118858026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0127_text_document -0.00024647215050850076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0128_text_document -0.00024326708810109974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0129_text_document -0.0002931046025004214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0130_text_document -0.00022529330733557138 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0131_text_document -0.00024288319647667783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0132_text_document -0.0003170441859608398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0133_text_document -0.00032183678547706126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0134_text_document -0.00020557308761968548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0135_text_document -0.00020890924417592562 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0136_text_document -0.00021111297420597103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0137_text_document -0.00021993650550023244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0138_text_document -0.0002123163519100286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0139_text_document -0.0002103629651549111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0140_text_document -0.00021370932994199264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0141_text_document -0.00020399994203827728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0142_text_document -0.00021563034464531022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0143_text_document -0.0002119386189866467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0144_text_document -0.00020333697838057754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0145_text_document -0.00020812225502998168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0146_text_document -0.0002192034455873437 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0147_text_document -0.0002146433860256116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0148_text_document -0.00022498320338620924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0149_text_document -0.00020605974297327904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0150_text_document -0.00020911517614300505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0151_text_document -0.00022086517759478398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0152_text_document -0.00021332423639106333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0153_text_document -0.00020576019154376813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0154_text_document -0.00020504347709097317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0155_text_document -0.00020777754226086552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0156_text_document -0.00021294564928541406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0157_text_document -0.00020775275197134613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0158_text_document -0.00021002644029417448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0159_text_document -0.00021013797882725636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0160_text_document -0.00019076903434985646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0161_text_document -0.00019137766426576477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0162_text_document -0.0001841037351078922 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0163_text_document -0.0001952863228508793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0164_text_document -0.00018602295481911772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0165_text_document -0.0001931370361427833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0166_text_document -0.0001801085437374987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0167_text_document -0.000188289716886196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0168_text_document -0.0001852865203803285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0169_text_document -0.00018892492640726607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0170_text_document -0.0001867706345514145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0171_text_document -0.00018688900901065678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0172_text_document -0.00018978617486719294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0173_text_document -0.00019074400515584856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0174_text_document -0.00018895644551080948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0175_text_document -0.0002014139475504348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0176_text_document -0.00019178652165604014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0177_text_document -0.00019538713758341256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0178_text_document -0.00019221603071045457 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0179_text_document -0.00018559646736351844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0180_text_document -0.00018839424919962872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0181_text_document -0.00031369302654824313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0182_text_document -0.00029283955302533026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0183_text_document -0.0003003216050130351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0184_text_document -0.00030560088357585723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0185_text_document -0.00030852297965873606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0186_text_document -0.00030137151200383515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0187_text_document -0.000287675564141583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0188_text_document -0.0002865118305148982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0189_text_document -0.0002942394807592494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0190_text_document -0.0002892999122858095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0191_text_document -0.00029726222843843534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0192_text_document -0.0002865106197035132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0193_text_document -0.0002831383377282607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0194_text_document -0.00029911101649033976 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0195_text_document -0.0002879193266837814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0196_text_document -0.000293888834619463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0197_text_document -0.00028471984768159116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0198_text_document -0.0002880090219919074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0199_text_document -0.0002916398711835823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0200_text_document -0.00029790830243728387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0201_text_document -0.00028328873748227157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0202_text_document -0.000295084201372288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0203_text_document -0.0002870500420988019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0204_text_document -0.00028061238206088403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0205_text_document -0.00028268741759946835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0206_text_document -0.0002832900433124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0207_text_document -0.0002821269671667503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0208_text_document -0.00028388007298379026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0209_text_document -0.0002811354392519064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0210_text_document -0.0002811576793347316 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0211_text_document -0.000291266961761568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0212_text_document -0.0002930917058536775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0213_text_document -0.00029247722771384336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0214_text_document -0.00030253733431717943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0215_text_document -0.0002988938219536017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0216_text_document -0.0003002888817617649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0217_text_document -0.00028686614758997625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0218_text_document -0.00032046548753382687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0219_text_document -0.00027752519729998216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0220_text_document -0.00026529350985605245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0221_text_document -0.0002654493836819182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0222_text_document -0.00026232091015406547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0223_text_document -0.0002599081762104853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0224_text_document -0.0002835817651903514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0225_text_document -0.00026294839748242733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0226_text_document -0.0002610835823452124 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0227_text_document -0.000260110886669002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0228_text_document -0.000253371820236557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0229_text_document -0.0002581811396117453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0230_text_document -0.0002514852630632709 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0231_text_document -0.00025726705673313424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0232_text_document -0.00025592912496079053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0233_text_document -0.00025012268192543976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0234_text_document -0.00024391340520007348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0235_text_document -0.0002384383639062725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0236_text_document -0.00023975576001149118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0237_text_document -0.0002338016280970284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0238_text_document -0.0002439200883556984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0239_text_document -0.00024142268942556778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0240_text_document -0.0002427966777591219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0241_text_document -0.00024280144153436732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0242_text_document -0.00024065658615901044 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0243_text_document -0.00024455143739741974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0244_text_document -0.00023239795390635735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0245_text_document -0.0002582911684560293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0246_text_document -0.00024625861259252923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0247_text_document -0.0002391576312805854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0248_text_document -0.000238078180343909 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0249_text_document -0.00023486425304981024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0250_text_document -0.0002355893518655022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0251_text_document -0.0002366129403678232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0252_text_document -0.00023595832035066449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0253_text_document -0.00023327574008525872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0254_text_document -0.00024148789011315923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0255_text_document -0.0002373778500991465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0256_text_document -0.00023955987733466374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0257_text_document -0.000230949882722363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0258_text_document -0.00023691636140836262 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0259_text_document -0.0002296963977634624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0260_text_document -0.0002332661069034444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0261_text_document -0.00023843042502126992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0262_text_document -0.00023511746712743498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0263_text_document -0.0002347369877896436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0264_text_document -0.0002323753243697275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0265_text_document -0.00026669348300156857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0266_text_document -0.00025799845912273273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0267_text_document -0.00027628560903016796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0268_text_document -0.00026519284616643963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0269_text_document -0.00026441815097637077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0270_text_document -0.0002662131391195505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0271_text_document -0.00027728803868991606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0272_text_document -0.0002769764618252775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0273_text_document -0.00027646939593325287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0274_text_document -0.0002624622460988396 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0275_text_document -0.0002597094641937235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0276_text_document -0.00026414993058715923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0277_text_document -0.00027056496256926013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0278_text_document -0.0002594411680362496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0279_text_document -0.00026263805833060905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0280_text_document -0.0002560343870682032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0281_text_document -0.0002624349038750109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0282_text_document -0.00025919416325410714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0283_text_document -0.0002611522977423299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0284_text_document -0.00023679129688303509 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0285_text_document -0.0002424050866477902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0286_text_document -0.00022701047777126036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0287_text_document -0.00023885339653333248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0288_text_document -0.00024106734540671208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0289_text_document -0.0002258801520250309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0290_text_document -0.0003279882524990489 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0291_text_document -0.00033565261995537515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0292_text_document -0.0003289323356607256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0293_text_document -0.0003074095430777535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0294_text_document -0.0003207680812935341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0295_text_document -0.00031455349141131964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0296_text_document -0.0003292847953027658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0297_text_document -0.0003336588045388259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0298_text_document -0.00031509118791912046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0299_text_document -0.0003142598967986839 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0300_text_document -0.00030783273695855995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0301_text_document -0.0003180584048660508 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0302_text_document -0.0003132932087805931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0303_text_document -0.00031883257979717144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0304_text_document -0.00030944547256766847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0305_text_document -0.00030308947812968015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0306_text_document -0.00027546560713402303 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0307_text_document -0.0002849896883269672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0308_text_document -0.00028854314233644503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0309_text_document -0.00028915140229591915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0310_text_document -0.00028785031389006415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0311_text_document -0.00029386612956137296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0312_text_document -0.00027190973100817075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0313_text_document -0.00028482862326451903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0314_text_document -0.00028103519882799385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0315_text_document -0.00027510038584601916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0316_text_document -0.00028413351954904745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0317_text_document -0.0002766838847779375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0318_text_document -0.00026734717208098886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0319_text_document -0.0002798212098651715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0320_text_document -0.0002747771651023886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0321_text_document -0.0002653649112010507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0322_text_document -0.0002631895073950362 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0323_text_document -0.00027233897055462913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0324_text_document -0.00026295942114759743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0325_text_document -0.00030523368071333024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0326_text_document -0.00022951852300606208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0327_text_document -0.00022441558532523096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0328_text_document -0.00022508048810748277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0329_text_document -0.00021854625167048365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0330_text_document -0.00032578339433634126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0331_text_document -0.0003234065091465547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0332_text_document -0.00031578848940780525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0333_text_document -0.0003211733834987297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0334_text_document -0.00030598592011548813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0335_text_document -0.00030636342203205056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0336_text_document -0.0003057832116313887 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0337_text_document -0.000314036788141844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0338_text_document -0.00030966829419359915 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0339_text_document -0.00030590256959722885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0340_text_document -0.0003098044211320355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0341_text_document -0.00031610551467687426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0342_text_document -0.0003181946275637243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0343_text_document -0.00030594263323826957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0344_text_document -0.0003126680759448145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0345_text_document -0.0002992280964722656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0346_text_document -0.00029925238994904177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0347_text_document -0.0003002679127100512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0348_text_document -0.00029525568123898354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0349_text_document -0.0003024653097967333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0350_text_document -0.0002953978348393056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0351_text_document -0.0003002611325611784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0352_text_document -0.0002957202302765213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0353_text_document -0.00029316969879070013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0354_text_document -0.00029927093466316167 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0355_text_document -0.00029673566591636904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0356_text_document -0.0002937689672539696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0357_text_document -0.0002973606684406085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0358_text_document -0.0002964111065178358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0359_text_document -0.0003023024169175062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0360_text_document -0.0003023653161749783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0361_text_document -0.0003041586406248139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0362_text_document -0.00029561553630767535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0363_text_document -0.00024185982713467274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0364_text_document -0.00023843085692504566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0365_text_document -0.00024640440430345615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0366_text_document -0.0002514283272863322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0367_text_document -0.0002428429062712565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0368_text_document -0.00023806417358106035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0369_text_document -0.000241345504518809 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0370_text_document -0.00023475737093303525 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0371_text_document -0.00024315922889458298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0372_text_document -0.0002509834540572025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0373_text_document -0.00025303820591366467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0374_text_document -0.00023678822937901864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0375_text_document -0.00023171129872234371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0376_text_document -0.00024461347186013167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0377_text_document -0.00023799008209254456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0378_text_document -0.00023090419051131675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0379_text_document -0.0002236725770641727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0380_text_document -0.00023567214707890686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0381_text_document -0.0002262722125540663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0382_text_document -0.00034312492202384507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0383_text_document -0.00021814471912144287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0384_text_document -0.00023259303719099642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0385_text_document -0.00031953022508126173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0386_text_document -0.00023554778297810253 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0387_text_document -0.0002460294175476815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0388_text_document -0.0002407153820838108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0389_text_document -0.0002374237316074476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0390_text_document -0.00023980889380119253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0391_text_document -0.0002511495625217406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0392_text_document -0.0002455758117178104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0393_text_document -0.00024203242698955926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0394_text_document -0.00024139601603558614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0395_text_document -0.00024286894291167163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0396_text_document -0.00023208951019510916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0397_text_document -0.0002357404012027918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0398_text_document -0.00023446333528494393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0399_text_document -0.0002366761658977476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0400_text_document -0.0002382598783135322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0401_text_document -0.00023065268726624828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0402_text_document -0.00022821836479753894 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0403_text_document -0.00023184541693801962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0404_text_document -0.00023323789396160382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0405_text_document -0.00022765013762883577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0406_text_document -0.00023714308028716352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0407_text_document -0.00028689301916209046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0408_text_document -0.0003409253474017267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0409_text_document -0.0003375051344730567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0410_text_document -0.0003292176313040109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0411_text_document -0.00032955022485317955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0412_text_document -0.0003279397699428092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0413_text_document -0.0003197789907967984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0414_text_document -0.00031901270687106177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0415_text_document -0.000321273794216131 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0416_text_document -0.0003220857325921838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0417_text_document -0.00031002969769902754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0418_text_document -0.00031282247512778876 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0419_text_document -0.0003087408247659614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0420_text_document -0.0003000588357430778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0421_text_document -0.0003050525128747414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0422_text_document -0.0003038755807622741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0423_text_document -0.00029692774685276133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0424_text_document -0.0003116160903862434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0425_text_document -0.00031084101832927995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0426_text_document -0.00030708350656830715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0427_text_document -0.00031743538194191725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0428_text_document -0.00031694261996253895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0429_text_document -0.0003146446823405206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0430_text_document -0.00030156651655858596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0431_text_document -0.000303240651608455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0432_text_document -0.00032558453868072364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0433_text_document -0.0002973680179620588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0434_text_document -0.0002971760577119216 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0435_text_document -0.0002973002298006474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0436_text_document -0.0002878620791957177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0437_text_document -0.00029632190555443135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0438_text_document -0.0002946733596926658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0439_text_document -0.00029877307004917556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0440_text_document -0.00029551091884749816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0441_text_document -0.0002976670701108049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0442_text_document -0.0002888352867396029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0443_text_document -0.0002866799361024954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0444_text_document -0.0002859222006630905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0445_text_document -0.00028581831052887173 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0446_text_document -0.00028506927387831265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0447_text_document -0.0002803249093757669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0448_text_document -0.0002809203104492272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0449_text_document -0.00028454145587367076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0450_text_document -0.00028584177277598123 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0451_text_document -0.00028086934160805217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0452_text_document -0.000270936293938279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0453_text_document -0.00028304258342716634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0454_text_document -0.00028276074943094315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0455_text_document -0.0002602100764561298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0456_text_document -0.00028012504824815937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0457_text_document -0.0002608944608134916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0458_text_document -0.0002845289889094832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0459_text_document -0.0002717532367216808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0460_text_document -0.0002643974553814476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0461_text_document -0.0002758213344366294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0462_text_document -0.0002753861114186629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0463_text_document -0.00031845649723981725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0464_text_document -0.00032153756772406746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0465_text_document -0.0003223378422301534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0466_text_document -0.0002996787108131847 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0467_text_document -0.00030486709979224023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0468_text_document -0.00031053773722556385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0469_text_document -0.0003002771838331003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0470_text_document -0.00029794449770130684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0471_text_document -0.0003033670930430196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0472_text_document -0.0002965031647098184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0473_text_document -0.0002837085032811094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0474_text_document -0.0002828420727162801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0475_text_document -0.00028941167269403106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0476_text_document -0.00029157564190928313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0477_text_document -0.00029812762761704826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0478_text_document -0.0002961388642406645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0479_text_document -0.0002838466433847451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0480_text_document -0.0002788779144959817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0481_text_document -0.0003402152386086791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0482_text_document -0.00037332501068667467 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0483_text_document -0.0002413675200116708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0484_text_document -0.0003704235275199961 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0485_text_document -0.0002379466982220781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0486_text_document -0.00035089333509974934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0487_text_document -0.00023630817154070126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0488_text_document -0.00023857309295728839 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0489_text_document -0.0002435822475458576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0490_text_document -0.00023387703405383536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0491_text_document -0.00034319854187343774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0492_text_document -0.0003622737409420836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0493_text_document -0.00023570573166970698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0494_text_document -0.00022641527241191097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0495_text_document -0.00034243292431352653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0496_text_document -0.00024045245535407698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0497_text_document -0.00023676532885361976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0498_text_document -0.00022335363118071338 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0499_text_document -0.00023448598925498735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0500_text_document -0.00033737048365832474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0501_text_document -0.0003357751601882351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0502_text_document -0.0003383236392673138 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0503_text_document -0.0003397838415177592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0504_text_document -0.00033705937300296186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0505_text_document -0.00033448155827902774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0506_text_document -0.00034576892094196856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0507_text_document -0.00033674871522955814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0508_text_document -0.0003328110361659434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0509_text_document -0.00032432631363958473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0510_text_document -0.00032731656932112217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0511_text_document -0.00032024116066153716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0512_text_document -0.0003040305172335454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0513_text_document -0.00031659687802842567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0514_text_document -0.000303687860573204 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0515_text_document -0.0003155611705529593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0516_text_document -0.00030697272991348575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0517_text_document -0.00032874805540012775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0518_text_document -0.0003195460475675836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0519_text_document -0.00029999019685462926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0520_text_document -0.0003031992730055188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0521_text_document -0.0003004957313392662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0522_text_document -0.00029242533089655584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0523_text_document -0.0002940539652538529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0524_text_document -0.0003042748602544184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0525_text_document -0.00029329988520120374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0526_text_document -0.00028533980088048884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0527_text_document -0.0002995523399640371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0528_text_document -0.00024445982369612285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0529_text_document -0.0002341949821161716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0530_text_document -0.0002448827406649086 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0531_text_document -0.0002464661023748273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0532_text_document -0.0002458273043503861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0533_text_document -0.000234131092194839 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0534_text_document -0.00023502842288340058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0535_text_document -0.00023472409854696446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0536_text_document -0.0002353934437680525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0537_text_document -0.00023298716740292522 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0538_text_document -0.00023724345571185632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0539_text_document -0.0002463911915031484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0540_text_document -0.00023298903026561056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0541_text_document -0.00022884149754863258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0542_text_document -0.00023103945956545342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0543_text_document -0.0002444088792883614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0544_text_document -0.00022581722858094737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0545_text_document -0.0002370810502668904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0546_text_document -0.00022632319324174496 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0547_text_document -0.00023710168144645038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0548_text_document -0.00022964923090952467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0549_text_document -0.00023748320722538985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0550_text_document -0.0003222624777361089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0551_text_document -0.0002939065142920207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0552_text_document -0.0003163669341858318 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0553_text_document -0.0002875568128154461 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0554_text_document -0.0002891174847690085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0555_text_document -0.0002845830978145091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0556_text_document -0.0002834617830618547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0557_text_document -0.0002825955578364204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0558_text_document -0.0002770681818983043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0559_text_document -0.00027398693963975244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0560_text_document -0.00026761486776881346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0561_text_document -0.0002709662939745425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0562_text_document -0.0002715205476986883 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0563_text_document -0.0002694875173937183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0564_text_document -0.0002691404382855153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0565_text_document -0.0002555696578650904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0566_text_document -0.00025938400199289785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0567_text_document -0.00025330279781755557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0568_text_document -0.00025455190919542185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0569_text_document -0.0002596474980952091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0570_text_document -0.0002593765878092823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0571_text_document -0.00026530976177812846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0572_text_document -0.00026521586959931293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0573_text_document -0.00027156192778243744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0574_text_document -0.00026542489893346987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0575_text_document -0.0002637742757379441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0576_text_document -0.0002660391549513622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0577_text_document -0.0002622961692249776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0578_text_document -0.0002668259130904866 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0579_text_document -0.00026393281403990296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0580_text_document -0.0002573087912247817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0581_text_document -0.0002689284845925933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0582_text_document -0.0002587878565641303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0583_text_document -0.0002591277179432351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0584_text_document -0.00025645748667058553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0585_text_document -0.0002576834953920859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0586_text_document -0.0002574007659976351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0587_text_document -0.00026215195926907863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0588_text_document -0.0002550452573299244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0589_text_document -0.0002580549425113166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0590_text_document -0.0002580184320809385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0591_text_document -0.00026135902243793944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0592_text_document -0.0002499110939933153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0593_text_document -0.00023602977130289638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0594_text_document -0.0002179537404034863 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0595_text_document -0.000217790844069029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0596_text_document -0.00021511798361299487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0597_text_document -0.00025422459968044684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0598_text_document -0.00026310640293852807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0599_text_document -0.0003408740036680742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0600_text_document -0.00025777786217145044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0601_text_document -0.00025244460970438263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0602_text_document -0.00025351648924446906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0603_text_document -0.0003423231978018855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0604_text_document -0.0003423953052478566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0605_text_document -0.0003318569148201118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0606_text_document -0.00032767638761629247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0607_text_document -0.00033215390937927025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0608_text_document -0.00032618622802635676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0609_text_document -0.00032507622347617733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0610_text_document -0.00031030763419557833 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_head-0611_text_document -0.00024643590119480534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0000_text_document -0.0002095902169870633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0001_text_document -0.00021403593088797332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0002_text_document -0.0002227102409085263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0003_text_document -0.00020197706221244385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0004_text_document -0.00022874875522106917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0005_text_document -0.00023280550472601052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0006_text_document -0.00022749578163623905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0007_text_document -0.00023802912323224644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0008_text_document -0.00023176496190267302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0009_text_document -0.0002278986856648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0010_text_document -0.00021833909531790053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0011_text_document -0.0003080057114591217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0012_text_document -0.00021694016663911526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0013_text_document -0.0002668830492707773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0014_text_document 
-0.00024523658363304193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0015_text_document -0.0002894756615830288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0016_text_document -0.00020347856162111349 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0017_text_document -0.0002134325832786435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0018_text_document -0.00021673235231198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0019_text_document -0.0002654127125833355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0020_text_document -0.0002158672209137081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0021_text_document -0.00023947604851382316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0022_text_document -0.00026152140024106367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0023_text_document -0.00021518621527788343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0024_text_document -0.0002439782139658387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0025_text_document -0.0002905141391659118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0026_text_document -0.00021642682185339982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0027_text_document -0.00019960430947798375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0028_text_document -0.00026322267340937706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0029_text_document -0.00022334429465509248 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0030_text_document -0.00022855119280875728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0031_text_document -0.00028578658731994404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0032_text_document -0.0002584277862839571 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0033_text_document -0.00021861958226794765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0034_text_document -0.00026614391185475836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0035_text_document -0.00028970533715167736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0036_text_document -0.0002235814952215254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0037_text_document -0.00022032188312044515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0038_text_document -0.00022884461811511293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0039_text_document -0.0002551680347396578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0040_text_document -0.00022883355545520197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0041_text_document -0.0002232938120141678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0042_text_document -0.0002691617763064546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0043_text_document -0.00023572139842386745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0044_text_document -0.0002552819803341825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0045_text_document 
-0.00027155660031106415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0046_text_document -0.00021551548292117663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0047_text_document -0.00020620735756494168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0048_text_document -0.0002166820604491231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0049_text_document -0.00018501398539579828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0050_text_document -0.00027225222848112053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0051_text_document -0.00023371832644559636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0052_text_document -0.00023566702124489628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0053_text_document -0.00023686334707090557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0054_text_document -0.00022423975285568458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0055_text_document -0.0002528257228301147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0056_text_document -0.0002561855163693918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0057_text_document -0.00022810786925037496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0058_text_document -0.0002762405538154904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0059_text_document -0.00022261162863844723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0060_text_document -0.00022540915157909426 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0061_text_document -0.00022299985657677767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0062_text_document -0.00022755525774778565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0063_text_document -0.00024165856540482104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0064_text_document -0.00025687628451136137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0065_text_document -0.0002231870244226192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0066_text_document -0.00026580529164370396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0067_text_document -0.00028870521089646587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0068_text_document -0.00021540624754582923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0069_text_document -0.00025778332069476944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0070_text_document -0.00021926796929661694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0071_text_document -0.00026029886649394187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0072_text_document -0.00022285796310592967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0073_text_document -0.00023080628286139754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0074_text_document -0.00025245808263416443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0075_text_document -0.00022457772027503216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0076_text_document 
-0.00024435224362284627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0077_text_document -0.00022526086938759533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0078_text_document -0.0002673487094116284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0079_text_document -0.00023263089713557213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0080_text_document -0.00021778225362633044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0081_text_document -0.00021409630017652816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0082_text_document -0.00022267016739539933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0083_text_document -0.00020585884947224638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0084_text_document -0.00022993683686780696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0085_text_document -0.00024242353683668374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0086_text_document -0.0002092411836993767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0087_text_document -0.0002197488902020793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0088_text_document -0.00021875038642425168 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0089_text_document -0.0002494827261520774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0090_text_document -0.00023601123399284122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0091_text_document -0.00021826172481591926 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0092_text_document -0.000236632672200321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0093_text_document -0.00025074570040713444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0094_text_document -0.00022642420961164095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0095_text_document -0.00023812142057551977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0096_text_document -0.0002428821562055837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0097_text_document -0.00022488741946885592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0098_text_document -0.00020317409833506262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0099_text_document -0.00021856439903312987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0100_text_document -0.0002106925714107645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0101_text_document -0.00021119826681040816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0102_text_document -0.0002592340274790045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0103_text_document -0.00023255611509461946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0104_text_document -0.00020894883617804318 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0105_text_document -0.00022615604129768463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0106_text_document -0.000203728797783905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0107_text_document 
-0.0001989690950208705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0108_text_document -0.00021734356057002846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0109_text_document -0.0002433390106922548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0110_text_document -0.00022031295850762523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0111_text_document -0.00022344289507866802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0112_text_document -0.00022230083290263739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0113_text_document -0.00021439002065826426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0114_text_document -0.0002041951415667326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0115_text_document -0.00022877491032651992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0116_text_document -0.00021999090587860643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0117_text_document -0.00025682432698074305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0118_text_document -0.00024400030399295212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0119_text_document -0.00022789294060424558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0120_text_document -0.00021497724986548528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0121_text_document -0.00023813142494777905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0122_text_document -0.00021895635220322673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0123_text_document -0.00023328497887722523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0124_text_document -0.00022164528342855325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0125_text_document -0.0002484042811809953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0126_text_document -0.00021121568758750245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0127_text_document -0.00020558498767931708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0128_text_document -0.00024543621326022564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0129_text_document -0.00019902438240619879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0130_text_document -0.00023691721805865155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0131_text_document -0.00021791494779355714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0132_text_document -0.0002240264291639859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0133_text_document -0.0002473539109425455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0134_text_document -0.0002071473371471445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0135_text_document -0.00021022258828332134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0136_text_document -0.00022311670653909265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0137_text_document -0.00022930107525031038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0138_text_document 
-0.0002214421423002716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0139_text_document -0.00021570132519262982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0140_text_document -0.0002197681200389886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0141_text_document -0.0002800029152388595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0142_text_document -0.00026843440765131945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0143_text_document -0.0002849765317975514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0144_text_document -0.00027096319463304773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0145_text_document -0.00027086227426919104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0146_text_document -0.0002526247335698449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0147_text_document -0.00027363240217034764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0148_text_document -0.0002623467059155748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0149_text_document -0.00027346078063921375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0150_text_document -0.00025920642956814055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0151_text_document -0.00025705335691494745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0152_text_document -0.00025922805782841715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0153_text_document -0.0002788336705199961 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0154_text_document -0.00024845909125095083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0155_text_document -0.00028656519284339746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0156_text_document -0.00025647131598268287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0157_text_document -0.0002784068234736532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0158_text_document -0.0002528120161786896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0159_text_document -0.0002488190053053583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0160_text_document -0.0002704389893183884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0161_text_document -0.00025616941425622545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0162_text_document -0.00026029019534693783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0163_text_document -0.00025685556571703545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0164_text_document -0.00019723833812640722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0165_text_document -0.0001895418580073486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0166_text_document -0.00019011078486016846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0167_text_document -0.00018779376696334834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0168_text_document -0.00018563641007150188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0169_text_document 
-0.00018754827458482748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0170_text_document -0.00019755194962803275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0171_text_document -0.00028610572842390993 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0172_text_document -0.00019902354772130188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0173_text_document -0.00020283251106846995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0174_text_document -0.00018722834815639619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0175_text_document -0.00018348325202476222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0176_text_document -0.0002739432916909774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0177_text_document -0.00014534657139819037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0178_text_document -0.00015282753276716084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0179_text_document -0.0001549244865585569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0180_text_document -0.0001465220076427807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0181_text_document -0.00015309131688759006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0182_text_document -0.0001462273984264752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0183_text_document -0.00014903597785697923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0184_text_document -0.0001547302246314982 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0185_text_document -0.0001486478323505694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0186_text_document -0.00014887945296702178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0187_text_document -0.00014582128695700495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0188_text_document -0.00015040846513981096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0189_text_document -0.0001492663985213415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0190_text_document -0.0001491503509128408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0191_text_document -0.00014485595166153977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0192_text_document -0.00014471245274265675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0193_text_document -0.0001539836098505113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0194_text_document -0.00014269340600113259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0195_text_document -0.0001366015589763494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0196_text_document -0.00014275967558886846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0197_text_document -0.00012216291308335102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0198_text_document -9.860253447438225e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0199_text_document -0.00013395002197992724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0200_text_document 
-0.00013095775634161855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0201_text_document -0.00013244501748701574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0202_text_document -0.00013344638268905827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0203_text_document -0.00013599432127141194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0204_text_document -0.0001319495730149868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0205_text_document -0.0001286425479982177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0206_text_document -0.0001288175023456875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0207_text_document -0.00014061678080985136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0208_text_document -0.000128553766351679 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0209_text_document -0.00013865417327932483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0210_text_document -0.00012918889813006947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0211_text_document -0.00013369372633056305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0212_text_document -0.00012818148109232114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0213_text_document -0.00013087168186794624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0214_text_document -0.00012209941459024034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0215_text_document -0.0001170049632015973 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0216_text_document -0.00013033065279061172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0217_text_document -0.00012782387759971287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0218_text_document -0.00012594444140907917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0219_text_document -0.00012747350244869554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0220_text_document -0.00011189052700824495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0221_text_document -0.000118474284791765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0222_text_document -0.00012947220948400783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0223_text_document -0.00011563584378100779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0224_text_document -0.00012898102925965738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0225_text_document -0.000122859118523654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0226_text_document -0.00013841949453733798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0227_text_document -0.00012735223374055142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0228_text_document -0.00013005120882648248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0229_text_document -0.000133953509788018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0230_text_document -0.00012898361006981912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0231_text_document 
-0.00012385687424414202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0232_text_document -0.00012495169231715962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0233_text_document -0.0001334287109141697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0234_text_document -0.0001251557347669207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0235_text_document -0.00012458204389205325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0236_text_document -0.00013142493999218836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0237_text_document -0.0001234876747521603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0238_text_document -0.00011414056156548952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0239_text_document -0.00023536944102421793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0240_text_document -0.00020899836320101376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0241_text_document -0.00020694945512603853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0242_text_document -0.0001985515975806629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0243_text_document -0.00020332234597425947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0244_text_document -0.00019901014809176087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0245_text_document -0.00019730742496077176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0246_text_document -0.0002086531104287768 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0247_text_document -0.00019880240459684486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0248_text_document -0.0001934729054969894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0249_text_document -0.00020006177554040137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0250_text_document -0.0001941325758266985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0251_text_document -0.00020329878081065027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0252_text_document -0.00020327608562464652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0253_text_document -0.00019798005487177493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0254_text_document -0.0001954984594242001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0255_text_document -0.0001990223203741723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0256_text_document -0.00019108660381768295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0257_text_document -0.00019716779886134537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0258_text_document -0.0001928475026596504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0259_text_document -0.00019634937526499807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0260_text_document -0.00019298574642019224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0261_text_document -0.00018884134414178089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0262_text_document 
-0.00018997833083144106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0263_text_document -0.0001905325885044214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0264_text_document -0.00020263821458910917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0265_text_document -0.0002079379871094917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0266_text_document -0.00019785431238092052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0267_text_document -0.00018722610077594935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0268_text_document -0.00019937636744768995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0269_text_document -0.00018558334637361332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0270_text_document -0.00019000469868035166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0271_text_document -0.0001853064471865308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0272_text_document -0.000189466635918149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0273_text_document -0.00019109828052136198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0274_text_document -0.00018290456266579745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0275_text_document -0.00017877060456109023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0276_text_document -0.00018344271945962216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0277_text_document -0.0001937669621232641 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0278_text_document -0.00019434311583686195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0279_text_document -0.0001805150932807986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0280_text_document -0.0001914582846585569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0281_text_document -0.00020025771498172507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0282_text_document -0.00019924956568197525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0283_text_document -0.000189496868442045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0284_text_document -0.0001929642820365483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0285_text_document -0.0001903124937955297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0286_text_document -0.00019497565890742164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0287_text_document -0.00018960064504727124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0288_text_document -0.00018568951646616373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0289_text_document -0.00018239686989629257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0290_text_document -0.00018605553146990633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0291_text_document -0.0001844096767388669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0292_text_document -0.00017898307999377337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0293_text_document 
-0.0001739406120499752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0294_text_document -0.0001911537409150027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0295_text_document -0.00017663348174413226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0296_text_document -0.00017913373123918278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0297_text_document -0.00017455805527093036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0298_text_document -0.00017536417503931625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0299_text_document -0.00017329247651270448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0300_text_document -0.00017912565587258707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0301_text_document -0.00017228776664782256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0302_text_document -0.0001825947205735245 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0303_text_document -0.0001696263054898423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0304_text_document -0.00017175867341643253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0305_text_document -0.0001668734295531042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0306_text_document -0.00016312507834781404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0307_text_document -0.0001687262224636195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0308_text_document -0.00017236097186979052 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0309_text_document -0.0002586993024691808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0310_text_document -0.00026219934972577114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0311_text_document -0.0002566784476550503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0312_text_document -0.0002530671575343629 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0313_text_document -0.00025526495987018773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0314_text_document -0.0002510505062545801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0315_text_document -0.00024743741398453804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0316_text_document -0.00024882602559273036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0317_text_document -0.00024230881628338428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0318_text_document -0.00025005854915078414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0319_text_document -0.00024477471955617643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0320_text_document -0.0002480463985551468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0321_text_document -0.00024335328103980772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0322_text_document -0.00024464696562773777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0323_text_document -0.00023820565587951385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0324_text_document 
-0.00024537554558786237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0325_text_document -0.00024052017934692743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0326_text_document -0.00023660347377746528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0327_text_document -0.00023823292504990384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0328_text_document -0.00023564543049854766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0329_text_document -0.0002370415962271789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0330_text_document -0.00023453319757168757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0331_text_document -0.000236480621339876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0332_text_document -0.0002391149628895737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0333_text_document -0.00023165934662137285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0334_text_document -0.00023331169915961683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0335_text_document -0.0002348226454144718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0336_text_document -0.00023564045570745751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0337_text_document -0.00016411316830860297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0338_text_document -0.0002007359738791159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0339_text_document -0.00019930606930833604 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0340_text_document -0.00019598670739211644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0341_text_document -0.00019115600211637036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0342_text_document -0.00018957338451495675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0343_text_document -0.0001997256344570198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0344_text_document -0.0001924339501051294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0345_text_document -0.0001929492409258573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0346_text_document -0.00019129356692417672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0347_text_document -0.0001927097658307402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0348_text_document -0.00018744016832935095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0349_text_document -0.00018898826127054628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0350_text_document -0.00019337725386559253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0351_text_document -0.00018434878571055096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0352_text_document -0.00018454731188528818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0353_text_document -0.00018197801455061398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0354_text_document -0.00018615322144032256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0355_text_document 
-0.00017981075274777777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0356_text_document -0.00018028813451030057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0357_text_document -0.0001760055343765487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0358_text_document -0.00018306121836089844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0359_text_document -0.00018110213343756692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0360_text_document -0.00017839531596627688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0361_text_document -0.00017668405792307465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0362_text_document -0.00018382867977972885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0363_text_document -0.00017812146256462094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0364_text_document -0.00017866992260811773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0365_text_document -0.00017457542446637375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0366_text_document -0.00017144357690622488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0367_text_document -0.00017669299438239817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0368_text_document -0.00017721730286035934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0369_text_document -0.0002573630336497748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0370_text_document -0.00025158500395961657 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0371_text_document -0.00025871208953576674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0372_text_document -0.0002522219361597465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0373_text_document -0.00025035546177162626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0374_text_document -0.00024714234522261514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0375_text_document -0.00024296206951019436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0376_text_document -0.00023797488747091152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0377_text_document -0.0002417964809184933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0378_text_document -0.0002317015633644362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0379_text_document -0.00023529081059722227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0380_text_document -0.00022865050303533797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0381_text_document -0.00022350627510674308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0382_text_document -0.00022416076407195612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0383_text_document -0.0002237152481700081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0384_text_document -0.00022673308251184112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0385_text_document -0.00021988509315558021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0386_text_document 
-0.00021791186375379613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0387_text_document -0.00021902394687174658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0388_text_document -0.00022390913224296354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0389_text_document -0.0002159569838456253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0390_text_document -0.000193074631476835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0391_text_document -0.00019772710141722856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0392_text_document -0.0001918863050023569 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0393_text_document -0.0001968641761834432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0394_text_document -0.00019269495646727515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0395_text_document -0.0001986463032193898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0396_text_document -0.0001855871337995234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0397_text_document -0.00019041152711008963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0398_text_document -0.00018277849340888642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0399_text_document -0.00018810546599505484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0400_text_document -0.00018711834399232793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0401_text_document -0.000180125082690484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0402_text_document -0.00023744084906469025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0403_text_document -0.00023803845013258319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0404_text_document -0.00023586547263857976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0405_text_document -0.00023222402329423718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0406_text_document -0.00023270999204422837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0407_text_document -0.00023378783679246331 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0408_text_document -0.00017304047941651873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0409_text_document -0.00017585076104150352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0410_text_document -0.00017101296884180275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0411_text_document -0.00017561096140154923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0412_text_document -0.0001713420333669203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0413_text_document -0.00022603582939637927 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0414_text_document -0.0001703733924033566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0415_text_document -0.0002396801442728503 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0416_text_document -0.0001676400523382032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0417_text_document 
-0.00017193438273170229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0418_text_document -0.00017529021040710947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0419_text_document -0.0001630741415909194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0420_text_document -0.00024179471702347313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0421_text_document -0.00016581358754145113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0422_text_document -0.0002456894490564403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0423_text_document -0.0002456073517995372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0424_text_document -0.00024937580109172706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0425_text_document -0.0002457208726475487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0426_text_document -0.00024399607429757567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0427_text_document -0.00023977003702270238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0428_text_document -0.0002453131498067917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0429_text_document -0.0001621090466807557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0430_text_document -0.00024557101413066944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0431_text_document -0.00024662307150866836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0432_text_document -0.00015758980646827074 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0433_text_document -0.00024391288666874046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0434_text_document -0.00023509503922816786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0435_text_document -0.00023489539281843744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0436_text_document -0.00023286637378893443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0437_text_document -0.00023379369093964089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0438_text_document -0.00023205784424428202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0439_text_document -0.00023009948269807432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0440_text_document -0.00023187584394201576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0441_text_document -0.00023202252759594008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0442_text_document -0.00022728777233539934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0443_text_document -0.00022582666382743133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0444_text_document -0.00022616733175598707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0445_text_document -0.00022768677294110565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0446_text_document -0.00022367789565066836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0447_text_document -0.00022752055218158585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0448_text_document 
-0.00021819243338256605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0449_text_document -0.0002241455531613807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0450_text_document -0.00022437797440403226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0451_text_document -0.00022445007197791702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0452_text_document -0.00022150502971124016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0453_text_document -0.0002225145672731263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0454_text_document -0.00022368982014371355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0455_text_document -0.00022402755606263736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0456_text_document -0.00023016090138940315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0457_text_document -0.0002260342841680707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0458_text_document -0.00022458279279977673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0459_text_document -0.00021839974448010203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0460_text_document -0.0002264409368746725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0461_text_document -0.000223550215762877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0462_text_document -0.00021610601829010048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0463_text_document -0.00022408120517524368 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0464_text_document -0.00021671066876802013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0465_text_document -0.00016072298972169641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0466_text_document -0.0001722371396276357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0467_text_document -0.00017303760343097654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0468_text_document -0.00016820735177759604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0469_text_document -0.0001782239553050235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0470_text_document -0.0001749477598265696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0471_text_document -0.0001700037698924768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0472_text_document -0.0001721297434219665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0473_text_document -0.00017082606704868714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0474_text_document -0.00017400024710211123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0475_text_document -0.00017016210162102983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0476_text_document -0.00016745166973214216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0477_text_document -0.0001684428163376526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0478_text_document -0.0001648685852885396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0479_text_document 
-0.00017387645508870812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0480_text_document -0.00016594906144137858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0481_text_document -0.00016042654972698604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0482_text_document -0.00014860104507835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0483_text_document -0.00016227281398002708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0484_text_document -0.00016502091577582913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0485_text_document -0.00016106235650927743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0486_text_document -0.00015987309712264371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0487_text_document -0.0001642815421701454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0488_text_document -0.00016531915249024665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0489_text_document -0.00015833872193897052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0490_text_document -0.00015639158495488916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0491_text_document -0.00015342548972376501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0492_text_document -0.00015518921543764528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0493_text_document -0.0001621958240469728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0494_text_document -0.00015155749799598 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0495_text_document -0.00014939896262383117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0496_text_document -0.00015490092509698006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0497_text_document -0.00017977881778259884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0498_text_document -0.00018596378104021417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0499_text_document -0.00017898738743182946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0500_text_document -0.00018286541046512472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0501_text_document -0.00018092409134830376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0502_text_document -0.00017788220095337013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0503_text_document -0.00017903547090898037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0504_text_document -0.0001797342122414524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0505_text_document -0.00018405110997743763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0506_text_document -0.00016587458814992502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0507_text_document -0.00018323507493237133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0508_text_document -0.00017881236669457928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0509_text_document -0.00017083385044833047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0510_text_document 
-0.0001730201559992492 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0511_text_document -0.00016901593018907565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0512_text_document -0.00017121838351155997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0513_text_document -0.0001762157419442059 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0514_text_document -0.00017000047903250774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0515_text_document -0.00017628842147757824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0516_text_document -0.0001760014416563697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0517_text_document -0.00017080626611158523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0518_text_document -0.00017077485831581488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0519_text_document -0.0001740210774510124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0520_text_document -0.00017310752988628116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0521_text_document -0.00016563538206915967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0522_text_document -0.0001698038028867437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0523_text_document -0.00022989652913943246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0524_text_document -0.00023802118237282655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0525_text_document -0.00023209291976691602 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0526_text_document -0.00023478978296678473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0527_text_document -0.00023185674392304132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0528_text_document -0.0002223151271899996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0529_text_document -0.0002212980337800594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0530_text_document -0.0002177142043482363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0531_text_document -0.00022071160791386127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0532_text_document -0.0002155092901614389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0533_text_document -0.00021709946336410436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0534_text_document -0.0002091085371649664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0535_text_document -0.00021301299764538067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0536_text_document -0.00020514046046681228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0537_text_document -0.00020554350961511138 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0538_text_document -0.0002032929572669402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0539_text_document -0.00020017696773262392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0540_text_document -0.0002041760983122544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0541_text_document 
-0.00019610775249750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0542_text_document -0.0001972797535028649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0543_text_document -0.00019987201182946655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0544_text_document -0.00023221090921479249 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0545_text_document -0.00022866265656078542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0546_text_document -0.00022846213721182363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0547_text_document -0.00022028779604045222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0548_text_document -0.00023019534411130514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0549_text_document -0.00021499063838892918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0550_text_document -0.0002238747556640398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0551_text_document -0.000219139079337847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0552_text_document -0.00022466810662919942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0553_text_document -0.00021354111452743537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0554_text_document -0.0002116352569318229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0555_text_document -0.00021742490236552721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0556_text_document -0.00020976053145397075 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0557_text_document -0.0002121893598598504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0558_text_document -0.00020611700008662688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0559_text_document -0.00020771394257887023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0560_text_document -0.00020861778045311834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0561_text_document -0.00020549717473124685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0562_text_document -0.00021168253336591858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0563_text_document -0.00020292362079976103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0564_text_document -0.0002053579978117472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0565_text_document -0.0002025742316233632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0566_text_document -0.00019721191770863706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0567_text_document -0.00020263891920926902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0568_text_document -0.0002047513235561355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0569_text_document -0.0002058192920224309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0570_text_document -0.00020762611235464895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0571_text_document -0.00020536767369033477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0572_text_document 
-0.000208726602681654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0573_text_document -0.00020670689006790867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0574_text_document -0.0001987029852837105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0575_text_document -0.00019743671572624558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0576_text_document -0.00020347237873346202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0577_text_document -0.00019483561225711876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0578_text_document -0.00019876706376189147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0579_text_document -0.00019418407035646924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0580_text_document -0.00019094739234588127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0581_text_document -0.00018896169178427298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0582_text_document -0.00019336957140803166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0583_text_document -0.00019246034436187084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0584_text_document -0.00019234601030075014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0585_text_document -0.00018937638801999214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0586_text_document -0.00019243149393005724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0587_text_document -0.00018564518487541217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0588_text_document -0.00018349694905090308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0589_text_document -0.00018632405912780405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0590_text_document -0.0001859374743982387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0591_text_document -0.00018735943662878573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0592_text_document -0.00018429223346416512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0593_text_document -0.00018743951405683122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0594_text_document -0.0002231790070545305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0595_text_document -0.00023691491440731282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0596_text_document -0.00022732583835977663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0597_text_document -0.00023280690754947414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0598_text_document -0.00023098339919576762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0599_text_document -0.00022742109041848038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0600_text_document -0.00023387941495424947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0601_text_document -0.00022226509841824269 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0602_text_document -0.00022342786655488707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0603_text_document 
-0.00022237713376406775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0604_text_document -0.00021379459835981835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0605_text_document -0.00021934823034546768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0606_text_document -0.00022299117012803982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0607_text_document -0.0002249652818475372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0608_text_document -0.00021549803647665793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0609_text_document -0.00021082391557018925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0610_text_document -0.0002063290532408184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0611_text_document -0.0002098859538424268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0612_text_document -0.00020927123951292785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0613_text_document -0.00020988114416198002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0614_text_document -0.00020708947339409333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0615_text_document -0.00020681735599881374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0616_text_document -0.00020862989695824213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0617_text_document -0.00017921617624032585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0618_text_document -0.0001869630178204498 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0619_text_document -0.0001837421970952879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0620_text_document -0.0002703540624747488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0621_text_document -0.0002773524903329593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0622_text_document -0.00026751943505093036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0623_text_document -0.00026849089128670544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0624_text_document -0.00017768273890485142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0625_text_document -0.00026694167218726514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0626_text_document -0.00026851367038544275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0627_text_document -0.00017178448275206052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0628_text_document -0.00026146356857229295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0629_text_document -0.0002631494175929668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0630_text_document -0.00026756049947472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0631_text_document -0.0002600735435281443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0632_text_document -0.00026162102069795645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0633_text_document -0.0002546230805208093 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0634_text_document 
-0.00025384118907342997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0635_text_document -0.00024898898905737453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0636_text_document -0.0002560021645785107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0637_text_document -0.00025001876340897294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0638_text_document -0.00024817567624010623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0639_text_document -0.00025419118513633326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0640_text_document -0.00025520008446783997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0641_text_document -0.00024355226527934937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0642_text_document -0.00024233948860872504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0643_text_document -0.00024413553528635867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0644_text_document -0.00024287456234999737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0645_text_document -0.0002471744870080021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0646_text_document -0.00024318841473052868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0647_text_document -0.00024268080340573577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0648_text_document -0.000242363177173413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0649_text_document -0.00025310826613573865 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0650_text_document -0.0002450433802404371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0651_text_document -0.0002429196089265994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0652_text_document -0.00023818874203405117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0653_text_document -0.00023814010078402416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0654_text_document -0.0002258262625271231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0655_text_document -0.0002359106231188901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0656_text_document -0.00023984369117779496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0657_text_document -0.00022677878582898447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0658_text_document -0.00023019334994987196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0659_text_document -0.0002326106169086802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0660_text_document -0.00023296218608853588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0661_text_document -0.00021930251468821644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0662_text_document -0.00022685746290158792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0663_text_document -0.00022204375118840136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0664_text_document -0.00022312982876300855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0665_text_document 
-0.00022347955655196657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0666_text_document -0.00021968416238742178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0667_text_document -0.00022148339454050315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0668_text_document -0.00022133417129237745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0669_text_document -0.00021840518548046784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0670_text_document -0.00021501258675160414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0671_text_document -0.00016302293581967305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0672_text_document -0.00018778970953587786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0673_text_document -0.00018910837173616491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0674_text_document -0.00019201324078164315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0675_text_document -0.00018432054093123207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0676_text_document -0.0001907749590824511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0677_text_document -0.00017971831966331778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0678_text_document -0.00018425384289495064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0679_text_document -0.00018353447605936826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0680_text_document -0.00018776194922919426 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0681_text_document -0.000181858547251418 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0682_text_document -0.00017663862855632625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0683_text_document -0.00017879513620194847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0684_text_document -0.00017779569087388971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0685_text_document -0.00017180299093946108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0686_text_document -0.00018535528192944938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0687_text_document -0.0001710147287879884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0688_text_document -0.000173577199328182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0689_text_document -0.0001768576763304655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0690_text_document -0.0001796376911260544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0691_text_document -0.00016782824293218567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0692_text_document -0.00016074324428116396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0693_text_document -0.0001687513348299545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0694_text_document -0.00016077518171436444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0695_text_document -0.00017132313128327624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0696_text_document 
-0.00016190186959679132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0697_text_document -0.00016290842504820753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0698_text_document -0.00016156811558387776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0699_text_document -0.00023555665280084346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0700_text_document -0.0002284718177796522 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0701_text_document -0.00022374123273516798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0702_text_document -0.00021994541999416394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0703_text_document -0.00022338573100973358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0704_text_document -0.00022049767881647008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0705_text_document -0.00022416439897413284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0706_text_document -0.00021895546198784436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0707_text_document -0.0002142388294097341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0708_text_document -0.00022017166748084383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0709_text_document -0.00021104350754308596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0710_text_document -0.00020985391201191366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0711_text_document -0.00021778183924550787 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0712_text_document -0.00021271266854227129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0713_text_document -0.0002086433619903549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0714_text_document -0.00021568150697937684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0715_text_document -0.00020764802098217656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0716_text_document -0.0002151205404833473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0717_text_document -0.00020430590834946775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0718_text_document -0.00020866543326050432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0719_text_document -0.00020818319961436583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0720_text_document -0.00020070798626764516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0721_text_document -0.00019693995826673832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0722_text_document -0.00020030234076064843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0723_text_document -0.00019788654054706263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0724_text_document -0.0001993257554824347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0725_text_document -0.00021386012383904914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0726_text_document -0.00021978412787373083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0727_text_document 
-0.0002175599344895926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0728_text_document -0.00021091594587352813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0729_text_document -0.0002034137316303627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0730_text_document -0.00021253423082914959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0731_text_document -0.00020816999471172712 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0732_text_document -0.00021853522405647908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0733_text_document -0.0002051944662085363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0734_text_document -0.00020978726975291983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0735_text_document -0.00020468921406556763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0736_text_document -0.0002007846124143192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0737_text_document -0.00020366090300396152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0738_text_document -0.0001993156168498017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0739_text_document -0.00020150340666889603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0740_text_document -0.00020188286325854645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0741_text_document -0.00020072267667247027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0742_text_document -0.00019591912629771525 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0743_text_document -0.00020056463740447396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0744_text_document -0.0001962511050627094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0745_text_document -0.00018969020412060633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0746_text_document -0.00018711981666080213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0747_text_document -0.00019064086480658448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0748_text_document -0.0001893430509717561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0749_text_document -0.00018823938035214858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0750_text_document -0.000191049243153872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0751_text_document -0.00015215085209234548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0752_text_document -0.00013881666461144156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0753_text_document -0.0001511979467407442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0754_text_document -0.00015091819106548992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0755_text_document -0.00013896830454629422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0756_text_document -0.00014286084497610213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0757_text_document -0.00013829013170563417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0758_text_document 
-0.00014842506748913496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0759_text_document -0.0001621698681108632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0760_text_document -0.00025658329333000087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0761_text_document -0.0002625776226522738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0762_text_document -0.00018893904126945972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0763_text_document -0.00019173419836462428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0764_text_document -0.00024972708669590365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0765_text_document -0.0002565621075859928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0766_text_document -0.0002548091984702725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0767_text_document -0.00024781120449025493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0768_text_document -0.00024190263274768403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0769_text_document -0.00024935480538538027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0770_text_document -0.00024565807926820224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0771_text_document -0.00024335665926774057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0772_text_document -0.0002407471035651234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0773_text_document -0.00024409063432302957 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0774_text_document -0.00025048184051844287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0775_text_document -2.7431736503196682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_middle-0776_text_document -0.0001542652540558753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0000_text_document -0.0001414689533672357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0001_text_document -0.00014218991553196462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0002_text_document -0.00014380616486339045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0003_text_document -0.00014537826992690233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0004_text_document -0.00015240156803853129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0005_text_document -0.0001508299161037807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0006_text_document -0.0001645724380011881 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0007_text_document -0.0001636434127327491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0008_text_document -0.0001425695379726649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0009_text_document -0.00015038309042278246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0010_text_document -0.00015551331010771582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0011_text_document -0.00014395190746068794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0012_text_document -0.00014572155617954775 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0013_text_document -0.00014985257363654754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0014_text_document -0.00016517178815597176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0015_text_document -0.00015368391453534256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0016_text_document -0.00013802907993189142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0017_text_document -0.0001438832947332681 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0018_text_document -0.0001453654604013201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0019_text_document -0.00015126685069470999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0020_text_document -0.00014666492015973732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0021_text_document -0.00015372684675786069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0022_text_document -0.0001466694423156705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0023_text_document -0.00014645983052842166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0024_text_document -0.00014464707855314855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0025_text_document -0.00014224079429035223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0026_text_document -0.00015150561574001976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0027_text_document -0.00014869251464718684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0028_text_document -0.00014975351070572874 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0029_text_document -0.00014693519813853656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0030_text_document -0.00015177096878176463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0031_text_document -0.0001541385774188545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0032_text_document -0.00014686140972307025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0033_text_document -0.00014836061485888312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0034_text_document -0.00015908940031748178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0035_text_document -0.00014335960523511807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0036_text_document -0.00014014336145596836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0037_text_document -0.00014804788542816872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0038_text_document -0.00014447262570766296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0039_text_document -0.0001490836674378867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0040_text_document -0.00015491171627451768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0041_text_document -0.00014704465686983656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0042_text_document -0.00015578029994136968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0043_text_document -0.00014442509556094932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0044_text_document -0.00016401352835433973 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0045_text_document -0.0001426617272165932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0046_text_document -0.00014952006301290383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0047_text_document -0.00014858509055287383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0048_text_document -0.0001452147802800582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0049_text_document -0.00014648995026373163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0050_text_document -0.000150292569067835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0051_text_document -0.00015359505638013499 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0052_text_document -0.00014342220561517732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0053_text_document -0.00015037020981817882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0054_text_document -0.0001442503228598675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0055_text_document -0.00015512168691210362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0056_text_document -0.000141978855262853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0057_text_document -0.0001433238477981227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0058_text_document -0.0001522852753554881 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0059_text_document -0.00015750021259583146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0060_text_document -0.0001620583984355833 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0061_text_document -0.00014425968431250636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0062_text_document -0.00015502607180742606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0063_text_document -0.00014808719854384823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0064_text_document -0.00014037741406088144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0065_text_document -0.00014415351915599912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0066_text_document -0.00014669998038063754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0067_text_document -0.00014168851942590583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0068_text_document -0.00016615444649487683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0069_text_document -0.00017314227247280456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0070_text_document -0.00014511886160872687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0071_text_document -0.0001589885117911034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0072_text_document -0.0001468857466370262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0073_text_document -0.00014409172483178647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0074_text_document -0.00017524066610798787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0075_text_document -0.0001423201779575328 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0076_text_document -0.00014813204150867026 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0077_text_document -0.0001426495065609589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0078_text_document -0.00015198519700337085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0079_text_document -0.0001407239353962083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0080_text_document -0.00015564799275992607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0081_text_document -0.00014044706039573722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0082_text_document -0.00014271692599994692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0083_text_document -0.000145622079855115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0084_text_document -0.0001420329587382314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0085_text_document -0.00014388823447845187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0086_text_document -0.0001386395317413269 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0087_text_document -0.00014615498445222442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0088_text_document -0.00014100731560794867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0089_text_document -0.0001412468938663676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0090_text_document -0.0001448361986040564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0091_text_document -0.00015041376595655126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0092_text_document -0.00015393889374199827 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0093_text_document -0.0001424230223910099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0094_text_document -0.00013832238850082653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0095_text_document -0.00014573052620396468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0096_text_document -0.00014871061906625763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0097_text_document -0.0001474653563212365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0098_text_document -0.00014332440162216428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0099_text_document -0.00013995360169386805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0100_text_document -0.0001396957447740551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0101_text_document -0.00014451429874557317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0102_text_document -0.00014667057760559536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0103_text_document -0.00014311302174425863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0104_text_document -0.0001486303888676766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0105_text_document -0.00014984904337848564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0106_text_document -0.00014471364010783683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0107_text_document -0.00014422564733335141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0108_text_document -0.00014833706425660122 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0109_text_document -0.0001547519654335586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0110_text_document -0.00016861028196725518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0111_text_document -0.00014655906054430117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0112_text_document -0.00016230445673145143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0113_text_document -0.0001608744287595928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0114_text_document -0.00014838797263124772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0115_text_document -0.00013772432541929463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0116_text_document -0.00014338391080519997 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0117_text_document -0.00013969596121954725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0118_text_document -0.00014433977111903384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0119_text_document -0.00013940910504621967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0120_text_document -0.00013841015875212353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0121_text_document -0.00015084460181936482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0122_text_document -0.00015609034169658813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0123_text_document -0.00014131566380676185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0124_text_document -0.0001489310284479002 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0125_text_document -0.000142611271970708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0126_text_document -0.00013893968956373896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0127_text_document -0.00014729183194546773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0128_text_document -0.00013844600256987405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0129_text_document -0.00014038359448051134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0130_text_document -0.00014148398954188355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0131_text_document -0.00014453817241187933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0132_text_document -0.00014513564218102443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0133_text_document -0.00013715869534969562 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0134_text_document -0.00013954027841855143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0135_text_document -0.0001412204761634212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0136_text_document -0.00014007212448120704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0137_text_document -0.00013479800901952131 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0138_text_document -0.00014295404043242684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0139_text_document -0.00013573518591642275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0140_text_document -0.00013489486257742554 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0141_text_document -0.00014869208126259815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0142_text_document -0.0001475290332523071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0143_text_document -0.00013460777613768496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0144_text_document -0.00013367855194670696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0145_text_document -0.00014765624643721848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0146_text_document -0.000134270744838921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0147_text_document -0.0001482262234332188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0148_text_document -0.00013864007544648005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0149_text_document -0.00014100224826604942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0150_text_document -0.00013048509121512907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0151_text_document -0.00012998584056022605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0152_text_document -0.00013999889926826433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0153_text_document -0.00015896524130927037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0154_text_document -0.00013250422088217822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0155_text_document -0.00013171712561130657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0156_text_document -0.0001353745598377907 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0157_text_document -0.00014385989862913682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0158_text_document -0.0001337953809308385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0159_text_document -0.00013398910556960017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0160_text_document -0.00013559702582181232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0161_text_document -0.0001479395819777683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0162_text_document -0.00013591800338063272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0163_text_document -0.0001378006151746279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0164_text_document -0.0001348466006623147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0165_text_document -0.0001333909410523815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0166_text_document -0.0001395413623736275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0167_text_document -0.0001340932768114764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0168_text_document -0.0001401874805500622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0169_text_document -0.00013331732046238236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0170_text_document -0.00013697107190707125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0171_text_document -0.00014902856836260464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0172_text_document -0.0001360841851454116 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0173_text_document -0.0001430140955418351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0174_text_document -0.00013769551793105646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0175_text_document -0.00013655894858384475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0176_text_document -0.00013428329448183135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0177_text_document -0.00013659792851661152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0178_text_document -0.0001390035871359384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0179_text_document -0.00013373343260207954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0180_text_document -0.0001377384027675603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0181_text_document -0.00013688101750180593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0182_text_document -0.00013942483868376482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0183_text_document -0.00013069676947684327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0184_text_document -0.00013248181223347942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0185_text_document -0.00013404120081582244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0186_text_document -0.0001361765930560515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0187_text_document -0.00012895481023244784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0188_text_document -0.0001269948854413741 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0189_text_document -0.00012923062571125647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0190_text_document -0.00013146444734116587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0191_text_document -0.00012866221788337398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0192_text_document -0.00012734919091675074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0193_text_document -0.00012491017673902597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0194_text_document -0.00012532940253474304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0195_text_document -0.00012942822588429847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0196_text_document -0.00012954876208363892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0197_text_document -0.00012757889363363662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0198_text_document -0.00012925483823692497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0199_text_document -0.00012887114306702046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0200_text_document -0.00014305235456549959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0201_text_document -0.00012924991650829868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0202_text_document -0.00013939886744592149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0203_text_document -0.00013473816912159447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0204_text_document -0.00012443870588817695 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0205_text_document -0.00012352413384768962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0206_text_document -0.00012363992848397884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0207_text_document -0.00012876521187895858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0208_text_document -0.00012998676310669105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0209_text_document -0.00013573707197851088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0210_text_document -0.00012914628304832383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0211_text_document -0.00012135846145074816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0212_text_document -0.0001272491158502837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0213_text_document -0.00014048669089899133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0214_text_document -0.00012821863542952837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0215_text_document -0.00012843614908145614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0216_text_document -0.00012566972592748682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0217_text_document -0.00012623965035462757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0218_text_document -0.00012745682281848042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0219_text_document -0.00012684031670531754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0220_text_document -0.00013734922167929733 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0221_text_document -0.00012364311692105248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0222_text_document -0.00012150014908859676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0223_text_document -0.00013255947544281956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0224_text_document -0.00013080450775030287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0225_text_document -0.00012642072366799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0226_text_document -0.00012748944981690917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0227_text_document -0.0001272640012288133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0228_text_document -0.00012676436334132504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0229_text_document -0.00011463874381385243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0230_text_document -0.0001259763726722257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0231_text_document -0.00013265355691888996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0232_text_document -0.00012800075083395775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0233_text_document -0.00012600035320386608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0234_text_document -0.00012796669337022373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0235_text_document -0.0001281363666451258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0236_text_document -0.00013103924202277517 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0237_text_document -0.00013710099201804686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0238_text_document -0.00012390494315996567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0239_text_document -0.00012375130141281296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0240_text_document -0.00012654460329615904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0241_text_document -0.00013347917998097572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0242_text_document -0.00012957465780002206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0243_text_document -0.00012365300899833007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0244_text_document -0.00012759104863989702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0245_text_document -0.00012669826503428652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0246_text_document -0.00019585621938937627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0247_text_document -0.00017199211271798405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0248_text_document -0.00017151003159557893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0249_text_document -0.0001704765251017538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0250_text_document -0.00016915887010107177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0251_text_document -0.000164507074917777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0252_text_document -0.0001707345009802067 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0253_text_document -0.00016235282921392888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0254_text_document -0.0001603312806389334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0255_text_document -0.00016063927887228715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0256_text_document -0.0001682293216120587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0257_text_document -0.00016945118701893779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0258_text_document -0.00016510575549830714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0259_text_document -0.00015878514261762818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0260_text_document -0.00016058925849180358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0261_text_document -0.00016806270202025228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0262_text_document -0.0001601743221175851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0263_text_document -0.00016728028661189246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0264_text_document -0.00016271828570438892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0265_text_document -0.0001663197659329172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0266_text_document -0.00016211838369998094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0267_text_document -0.00016174818095722866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0268_text_document -0.00016660916885770873 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0269_text_document -0.00016749279166083448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0270_text_document -0.00015990162967327836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0271_text_document -0.00016050019425679443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0272_text_document -0.00015826664805809287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0273_text_document -0.00015906002765230277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0274_text_document -0.00016496336225309003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0275_text_document -0.00015969348413764765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0276_text_document -0.00015888249989873604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0277_text_document -0.0001588217905168081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0278_text_document -0.0001579176192128451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0279_text_document -0.0001599592014593771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0280_text_document -0.00015860202306757735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0281_text_document -0.00015475539919197688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0282_text_document -0.0001606154789998261 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0283_text_document -0.00015967691482799697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0284_text_document -0.00015467004809542842 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0285_text_document -0.00015681467419158087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0286_text_document -0.0001622263618651377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0287_text_document -0.00016071879902106084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0288_text_document -0.00015926245724996415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0289_text_document -0.00015865169965265541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0290_text_document -0.0001558589009989086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0291_text_document -0.00015834413702510978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0292_text_document -0.00015984235618630313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0293_text_document -0.00015906347325722462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0294_text_document -0.0001540401129832678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0295_text_document -0.00015709268423517463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0296_text_document -0.00016150611616707217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0297_text_document -0.0001575761279522917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0298_text_document -0.00015145845456253164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0299_text_document -0.00015531545597525365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0300_text_document -0.00015290580088858923 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0301_text_document -0.00015077381822016696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0302_text_document -0.00016026706987479596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0303_text_document -0.00015143811781794564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0304_text_document -0.00015335594803302406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0305_text_document -0.00015760769888428818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0306_text_document -0.00016811053178478525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0307_text_document -0.00021456946285616728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0308_text_document -0.00021300214303968855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0309_text_document -0.00020349194545531642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0310_text_document -0.00021281325399560017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0311_text_document -0.00020973400589848146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0312_text_document -0.00020126033912157333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0313_text_document -0.00020674507357011296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0314_text_document -0.00021222543863325493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0315_text_document -0.0002050723383820817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0316_text_document -0.00021804813803312056 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0317_text_document -0.0002008803314227051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0318_text_document -0.0002150047024098784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0319_text_document -0.00020318723314588857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0320_text_document -0.00020021945595806058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0321_text_document -0.00020351797666608406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0322_text_document -0.00020832621085218548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0323_text_document -0.0002035206051090622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0324_text_document -0.00020272338181805027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0325_text_document -0.00020460676190716195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0326_text_document -0.00020717814792849565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0327_text_document -0.0001955827435950214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0328_text_document -0.00020417807396352577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0329_text_document -0.0002011029760914888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0330_text_document -0.00020292765823625672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0331_text_document -0.00020035339845060027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0332_text_document -0.0002019662525247444 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0333_text_document -0.000206838061219021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0334_text_document -0.0001941713531348939 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0335_text_document -0.00010283055875342613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0336_text_document -0.00010052128921034293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0337_text_document -0.00021410992316202177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0338_text_document -0.0002019050315219438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0339_text_document -0.0001986035353671086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0340_text_document -0.00019334420113344198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0341_text_document -0.0002040134561840194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0342_text_document -0.00019786749210973914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0343_text_document -0.00021292248961774976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0344_text_document -0.000198975254462317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0345_text_document -0.00019270601369753864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0346_text_document -0.0001938662101557011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0347_text_document -0.0002029610545170106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0348_text_document -0.0002024962737322469 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0349_text_document -0.0002075197885043544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0350_text_document -0.00019871985248356538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0351_text_document -0.0001949694696029141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0352_text_document -0.00020180408203543252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0353_text_document -0.00019545199817763088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0354_text_document -0.00019734611243298183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0355_text_document -0.00021047242956266074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0356_text_document -0.0001968562822164333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0357_text_document -0.00019972266049102106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0358_text_document -0.00020126122390730825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0359_text_document -0.00019799642896091898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0360_text_document -0.0002021712802087185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0361_text_document -0.0001941903201275054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0362_text_document -0.00019307283352311706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0363_text_document -0.00016598318480128866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0364_text_document -0.00016504803365649659 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0365_text_document -0.00016630327313193533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0366_text_document -0.00016601923469884076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0367_text_document -0.0001681694501234557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0368_text_document -0.00016859564709291555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0369_text_document -0.00016845101707974437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0370_text_document -0.0001643037792913447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0371_text_document -0.00016186624765418046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0372_text_document -0.00016697344045101027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0373_text_document -0.00016669715111205908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0374_text_document -0.00016364850623567704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0375_text_document -0.0001634811496926281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0376_text_document -0.00016825687707295152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0377_text_document -0.0001627585946667742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0378_text_document -0.00016582351614544805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0379_text_document -0.0001630893218980273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0380_text_document -0.0001568416151151013 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0381_text_document -0.00017273044852059518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0382_text_document -0.00016016530273273665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0383_text_document -0.00015777742226002822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0384_text_document -0.00016385370668116144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0385_text_document -0.00016954547679602915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0386_text_document -0.0001676626705219338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0387_text_document -0.00016250610371947111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0388_text_document -0.00016004510983519738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0389_text_document -0.000161815220320894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0390_text_document -0.00016744693680716642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0391_text_document -0.00015604191096880147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0392_text_document -0.0001636895622681933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0393_text_document -0.000158886517344257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0394_text_document -0.0001558972054341701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0395_text_document -0.0001591533045533395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0396_text_document -0.0001657955386528658 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0397_text_document -0.00016060726764524156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0398_text_document -0.00016167923208527019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0399_text_document -0.00015484394662326808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0400_text_document -0.00016052047349647775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0401_text_document -0.0001582576585363055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0402_text_document -0.0001545777833300399 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0403_text_document -0.00016303475566860345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0404_text_document -0.0001627904173369714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0405_text_document -0.0001567550665344843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0406_text_document -0.0001587287727580368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0407_text_document -0.0001606889088117574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0408_text_document -0.00016206324217472778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0409_text_document -0.00015712668987045555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0410_text_document -0.0001607143430081059 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0411_text_document -0.00015230600229428526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0412_text_document -0.00016067822548676263 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0413_text_document -0.00015993580979768466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0414_text_document -0.00016379843410396262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0415_text_document -0.0001533135627240871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0416_text_document -0.00016861285265852845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0417_text_document -0.0001632799417656467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0418_text_document -0.00015962871905586431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0419_text_document -0.00015014915949133304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0420_text_document -0.00015059096546324844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0421_text_document -0.00015841934874861877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0422_text_document -0.000152377097357806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0423_text_document -0.00014942797865989248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0424_text_document -0.00015640838403734855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0425_text_document -0.0001557305888039896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0426_text_document -0.00014992907934376868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0427_text_document -0.00015847297170019638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0428_text_document -0.0001563057066889321 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0429_text_document -0.00015425884830587555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0430_text_document -0.00015294599138593887 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0431_text_document -0.00015307387809393826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0432_text_document -0.00016021533866175615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0433_text_document -0.00015819924688246454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0434_text_document -0.00014854336050366525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0435_text_document -0.00015428039783626384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0436_text_document -0.00015380539006369472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0437_text_document -0.00015543551510602353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0438_text_document -0.00015792640857808265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0439_text_document -0.00015591945366146652 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0440_text_document -0.00014809559672766608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0441_text_document -0.00015190843215388426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0442_text_document -0.00014890757113683386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0443_text_document -0.0001610286090290533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0444_text_document -0.00015061787553649923 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0445_text_document -0.00014811603935037767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0446_text_document -0.00015254163073097444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0447_text_document -0.00015300211863900935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0448_text_document -0.00015063192030688013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0449_text_document -0.00015300622789493292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0450_text_document -0.00015096280425750327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0451_text_document -0.00015205454100558358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0452_text_document -0.00015121161958027361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0453_text_document -0.0001493611157597698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0454_text_document -0.00015838957873196607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0455_text_document -0.0001497669779590609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0456_text_document -0.00015173657097785533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0457_text_document -0.0001542516903028995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0458_text_document -0.000149139532833868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0459_text_document -0.00014644441551246194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0460_text_document -0.00015166787754612994 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0461_text_document -0.00014923555170687534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0462_text_document -0.00015589324574035403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0463_text_document -0.00015022803227804745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0464_text_document -0.00015127324533861265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0465_text_document -0.00014783676790095657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0466_text_document -0.00014927753645591052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0467_text_document -0.00014753911610765252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0468_text_document -0.00014886425094132403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0469_text_document -0.00014432622711023067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0470_text_document -0.00015087353567030766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0471_text_document -0.00015318739523991737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0472_text_document -0.00014716603935377532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0473_text_document -0.00015032310787320853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0474_text_document -0.00014425315738264723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0475_text_document -0.0001507311940067415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0476_text_document -0.0001735562949386336 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0477_text_document -0.0001664225151007615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0478_text_document -0.00017016223341338198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0479_text_document -0.0001686337558140661 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0480_text_document -0.00018737654520115072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0481_text_document -0.00016696818282464752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0482_text_document -0.00017542891864931188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0483_text_document -0.000168925038877694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0484_text_document -0.0001769097096293462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0485_text_document -0.00017465563985682533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0486_text_document -0.0001704723163845607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0487_text_document -0.00017113194080906855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0488_text_document -0.00017056770492485763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0489_text_document -0.0001736825492971628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0490_text_document -0.00017060994856935613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0491_text_document -0.00017539355807018588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0492_text_document -0.00017512560274649157 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0493_text_document -0.00017536288179601056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0494_text_document -0.00017214679473623093 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0495_text_document -0.00017372473469635212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0496_text_document -0.00016968876198424372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0497_text_document -0.00017328658337078598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0498_text_document -0.00016545006523949998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0499_text_document -0.0001712623636560391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0500_text_document -0.00017259544872761246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0501_text_document -0.00016731532955664165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0502_text_document -0.00017234554920296389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0503_text_document -0.00016824263782247044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0504_text_document -0.00017046154865322805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0505_text_document -0.00016701775451880861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0506_text_document -0.0001640723558698162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0507_text_document -0.00016912021224512063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0508_text_document -0.00016148128416798815 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0509_text_document -0.00017033021559990035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0510_text_document -0.00016742449903581303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0511_text_document -0.00016604941440707502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0512_text_document -0.00016168218680070063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0513_text_document -0.00016545734985198287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0514_text_document -0.00016617264790719555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0515_text_document -0.00016903898126379064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0516_text_document -0.00016251470403425602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0517_text_document -0.00016741321573477316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0518_text_document -0.00016314387702135404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0519_text_document -0.00016261766224352778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0520_text_document -0.00016043765927930694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0521_text_document -0.0001581188444159775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0522_text_document -0.0001675593630876091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0523_text_document -0.00016225811098829194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0524_text_document -0.00016027854790273813 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0525_text_document -0.00015477514040295668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0526_text_document -0.00016132027735084922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0527_text_document -0.00016144543812901825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0528_text_document -0.00016356924967160763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0529_text_document -0.00016721507926064277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0530_text_document -0.0001623283758093546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0531_text_document -0.00016540060361910116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0532_text_document -0.00016618517731232895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0533_text_document -0.0001661140965633334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0534_text_document -0.00016521134906101744 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0535_text_document -0.0001605250452596446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0536_text_document -0.00016158626615495202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0537_text_document -0.00016348402666537893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0538_text_document -0.00015887094758334445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0539_text_document -0.00016216761850919694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0540_text_document -0.00016125922688833952 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0541_text_document -0.00015719662175540762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0542_text_document -0.00016177908132776304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0543_text_document -0.0001616654955707841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0544_text_document -0.0001575744247706023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0545_text_document -0.00016594502227726776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0546_text_document -0.00016680360478028852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0547_text_document -0.00016969508752354227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0548_text_document -0.00018702211879271686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0549_text_document -0.00019358085009705273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0550_text_document -0.0001871367897387826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0551_text_document -0.00018452058370522755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0552_text_document -0.0001850164319455863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0553_text_document -0.00018589455402222413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0554_text_document -0.00018848818876445855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0555_text_document -0.00018677441309244695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0556_text_document -0.00018806266359047162 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0557_text_document -0.00018742615490284408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0558_text_document -0.00018308658912909244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0559_text_document -0.00017917024956722993 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0560_text_document -0.0001796815083811096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0561_text_document -0.00018830762534435366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0562_text_document -0.0001850705756497164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0563_text_document -0.00018620607609678367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0564_text_document -0.00018735293561315315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0565_text_document -0.00018406055855123805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0566_text_document -0.00018296049025592247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0567_text_document -0.00018407127494772196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0568_text_document -0.0001809459590066732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0569_text_document -0.00018206921683271417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0570_text_document -0.0001823423927624476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0571_text_document -0.00017843504198196598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0572_text_document -0.0001849074668186014 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0573_text_document -0.0001812163144813499 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0574_text_document -0.00018309068999374263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0575_text_document -0.00018500613289155086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0576_text_document -0.00017930403632760822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0577_text_document -0.0001846380543749688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0578_text_document -0.0001805411790348431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0579_text_document -0.00017815258406988848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0580_text_document -0.00017771149209661494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0581_text_document -0.000179212119800064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0582_text_document -0.0001770710081666354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0583_text_document -0.00018076802304233783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0584_text_document -0.00018266780486243524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0585_text_document -0.00017952537023013302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0586_text_document -0.00017482592939671484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0587_text_document -0.00017479307237867526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0588_text_document -0.00017947982239834899 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0589_text_document -0.00017800230944457152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0590_text_document -0.0001768045667273756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0591_text_document -0.00018432659029891628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0592_text_document -0.00017860310980883306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0593_text_document -0.00017352563618741148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0594_text_document -0.000177967402241009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0595_text_document -0.0001761394507080597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0596_text_document -0.0001727461411889822 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0597_text_document -0.00017520765607261058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0598_text_document -0.00017389963918978602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0599_text_document -0.00017297383567671195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0600_text_document -0.00017186248654837811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0601_text_document -0.00018016764298215066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0602_text_document -0.00017252933018279703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0603_text_document -0.0001720498259217191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0604_text_document -0.00017208910794032673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0605_text_document -0.0001638288329725128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0606_text_document -0.00015774370365565657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0607_text_document -0.00015428183891406193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0608_text_document -0.0001579263490987627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0609_text_document -0.00015679781661701012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0610_text_document -0.00015686067490532405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0611_text_document -0.00015476043642401294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0612_text_document -0.0001538144005636655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0613_text_document -0.00015471809257783847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0614_text_document -0.00014950254548936378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0615_text_document -0.00015189343275275787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0616_text_document -0.00016808135779534307 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0617_text_document -0.00015331380459020154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0618_text_document -0.00015025506525877266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0619_text_document -0.00015705079524537657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0620_text_document -0.00014843144411648014 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0621_text_document -0.0001536670204340525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0622_text_document -0.00014701650982417206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0623_text_document -0.0001470830903826265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0624_text_document -0.00014669457615379322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0625_text_document -0.00015327731341039172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0626_text_document -0.00016421071093813112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0627_text_document -0.00014320086554259857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0628_text_document -0.00014733292080267092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0629_text_document -0.00014574339323444963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0630_text_document -0.00014508510524362508 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0631_text_document -0.0001510667294376284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0632_text_document -0.00014448955337404646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0633_text_document -0.00015189242851477872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0634_text_document -0.0001408976680729981 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0635_text_document -0.00014495438771487836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0636_text_document -0.00014607129482780071 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0637_text_document -0.0001425703250247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0638_text_document -0.00014772556798043487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0639_text_document -0.0001454755294743558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0640_text_document -0.00014604759342940054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0641_text_document -0.000144987966876031 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0642_text_document -0.00014159362399631978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0643_text_document -0.00015166107543186514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0644_text_document -0.00013872638536941069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0645_text_document -0.00014392691133816916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0646_text_document -0.00014527538230304764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0647_text_document -0.0001445241296159157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0648_text_document -0.00014566980102669863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0649_text_document -0.00014105957349679274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0650_text_document -0.00014407711883329926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0651_text_document -0.00014304333666146412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0652_text_document -0.00014480474786471068 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0653_text_document -0.00014513562095603888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0654_text_document -0.00014216954843071324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0655_text_document -0.0001472056417215835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0656_text_document -0.0001411732545194045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0657_text_document -0.00014472737242668624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0658_text_document -0.0001412212585262607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0659_text_document -0.00020834639482623596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0660_text_document -0.00019484913874296875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0661_text_document -0.00019400182473285833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0662_text_document -0.000192581173021768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0663_text_document -0.0001958163408499538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0664_text_document -0.00019017201894348343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0665_text_document -0.00018748712836308062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0666_text_document -0.00019398325978096153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0667_text_document -0.00018740362852951608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0668_text_document -0.00018769931256921782 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0669_text_document -0.00018841740417805205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0670_text_document -0.0001897879160564146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0671_text_document -0.00018663113185306689 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0672_text_document -0.00018894652949372258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0673_text_document -0.0001929378648272062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0674_text_document -0.00019134942047365448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0675_text_document -0.00018699153383533985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0676_text_document -0.00018610331853766602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0677_text_document -0.0001863160274451902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0678_text_document -0.00018636405144302115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0679_text_document -0.00018489348621678148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0680_text_document -0.0001860176372198307 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0681_text_document -0.00018315031813541827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0682_text_document -0.00019049993633217256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0683_text_document -0.00018374255446481207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0684_text_document -0.00017918235151102646 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0685_text_document -0.00018078078222027994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0686_text_document -0.00018377134048126254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0687_text_document -0.00018119048712916442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0688_text_document -0.00018226290667237163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0689_text_document -0.00018539016766122422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0690_text_document -0.00018304864675259609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0691_text_document -0.00018006283819913595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0692_text_document -0.00017853375396011673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0693_text_document -0.0001806080666151815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0694_text_document -0.00018287085590792935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0695_text_document -0.00018102703894508278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0696_text_document -0.00017985249563069855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0697_text_document -0.00018055111208127884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0698_text_document -0.00017436715651687287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0699_text_document -0.0001750410902836745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0700_text_document -0.0001755658852086883 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0701_text_document -0.00017704710809249836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0702_text_document -0.00017563712144304312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0703_text_document -0.00017646118668991032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0704_text_document -0.0001738273848965312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0705_text_document -0.00017355052248297015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0706_text_document -0.00017182494917422235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0707_text_document -0.0001796801127149085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0708_text_document -0.0001535678074475219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0709_text_document -0.00016509131806569352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0710_text_document -0.0001660762988129014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0711_text_document -0.00017181117317139103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0712_text_document -0.00016385189811495075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0713_text_document -0.00016321938466065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0714_text_document -0.0001627668114510062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0715_text_document -0.0001667874841569603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0716_text_document -0.0001647336272051215 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0717_text_document -0.00015927038206724374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0718_text_document -0.000163069807004626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0719_text_document -0.00016643362662749963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0720_text_document -0.0001598347201275479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0721_text_document -0.00016414824852047793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0722_text_document -0.00016387374849716915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0723_text_document -0.00016218986007283508 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0724_text_document -0.00016170100645242406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0725_text_document -0.00016794279442600715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0726_text_document -0.00016410407241508566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0727_text_document -0.00016663924614304762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0728_text_document -0.0001610334643678992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0729_text_document -0.00016082817926927476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0730_text_document -0.00016483710320531984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0731_text_document -0.00015950564573034403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0732_text_document -0.00016176598872010603 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0733_text_document -0.00016374799045777884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0734_text_document -0.00016207070843359862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0735_text_document -0.000161310121195263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0736_text_document -0.0001590930806312555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0737_text_document -0.00015872700071854542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0738_text_document -0.0001601426608559989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0739_text_document -0.0001592737504230903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0740_text_document -0.0001599609389465664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0741_text_document -0.0001573951015313392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0742_text_document -0.00015918138446881715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0743_text_document -0.00016063409035052854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0744_text_document -0.00015479247307168076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0745_text_document -0.0001590206266750552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0746_text_document -0.00016413616409963463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0747_text_document -0.00015909403254717725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0748_text_document -0.00015912638065916792 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0749_text_document -0.00015509170291798033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0750_text_document -0.00015668221053756931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0751_text_document -0.00015993661313870757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0752_text_document -0.00015986553041529475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0753_text_document -0.0001551253906720823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0754_text_document -0.0001569044427999477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0755_text_document -0.00015512319487328638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0756_text_document -0.00016021882869106635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0757_text_document -0.00015415106017838012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0758_text_document -0.00015711650631982987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0759_text_document -0.00015512670736159294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0760_text_document -0.00016200410442893923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0761_text_document -0.00015949285619573655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0762_text_document -0.0001625616727060612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0763_text_document -0.00016316486655764686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0764_text_document -0.0001571167311565954 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0765_text_document -0.00016128213234978153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0766_text_document -0.00015535324730882956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0767_text_document -0.0001579934311592013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0768_text_document -0.00015195311864613838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0769_text_document -0.0001615190125670139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0770_text_document -0.00015867133202388371 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0771_text_document -0.00015932910049616658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0772_text_document -0.00015735730575532447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0773_text_document -0.00016192787415292593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0774_text_document -0.00015443514945271916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0775_text_document -0.00015290872574095856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0776_text_document -0.0001586657525675075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0777_text_document -0.0001561292345081933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0778_text_document -0.0001584146414910674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0779_text_document -0.00015282231142071527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0780_text_document -0.0001561252202711004 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0781_text_document -0.00015508367049496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0782_text_document -0.00015211947613405347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0783_text_document -0.00014976529550875275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0784_text_document -0.00015418186133444713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0785_text_document -0.00015777360151582686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0786_text_document -0.000152640262498424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0787_text_document -0.00015418142572863903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0788_text_document -0.00015502601134089746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0789_text_document -0.00015405733434421877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0790_text_document -0.00015484459497253604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0791_text_document -0.0001541867208689297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0792_text_document -0.00015014404352940876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0793_text_document -0.00015357544967633106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0794_text_document -0.00015037823631794736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0795_text_document -0.00015025795679285078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0796_text_document -0.00014876992710553488 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0797_text_document -0.00015032669711698612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0798_text_document -0.00015596697517010466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0799_text_document -0.00015498394440674378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0800_text_document -0.00014757314272111684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0801_text_document -0.00014919071614611802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0802_text_document -0.00014686280514246915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0803_text_document -0.00015882771228777683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0804_text_document -0.00014763597756322578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0805_text_document -0.00014785441795725526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0806_text_document -0.00015313024795352964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0807_text_document -0.0001497627986113246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0808_text_document -0.00014499607432690722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0809_text_document -0.0001461719027401259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0810_text_document -0.00014839933441537366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0811_text_document -0.0001475840995029022 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0812_text_document -0.00015065512711375653 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0813_text_document -0.00015285087358760883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0814_text_document -0.00014861957547794477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0815_text_document -0.00014996949492468605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0816_text_document -0.0001472998668365096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0817_text_document -0.0001464012147691964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0818_text_document -0.00015227635617231567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0819_text_document -0.0001491494017117428 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0820_text_document -0.00014464475787246092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0821_text_document -0.00014410767861685618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0822_text_document -0.000144919516791233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0823_text_document -0.00014507990635617585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0824_text_document -0.0001468797342896656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0825_text_document -0.0001422000420712919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0826_text_document -0.00014228987139298954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0827_text_document -0.00014481016912090385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0828_text_document -0.000142802473797815 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0829_text_document -0.00014812295450003065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0830_text_document -0.00014697991622146685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0831_text_document -0.000143946325289488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0832_text_document -0.0001418544716646782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0833_text_document -0.00014706985092768576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0834_text_document -0.0001411487598988699 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0835_text_document -0.0001583983550166893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0836_text_document -0.00015370277071378533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0837_text_document -0.0001574284524004961 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0838_text_document -0.00016033599900258183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0839_text_document -0.00016159470012508268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0840_text_document -0.00015624921021983388 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0841_text_document -0.0001603288323615303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0842_text_document -0.00016421653645625842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0843_text_document -0.00016136751182857813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0844_text_document -0.0001644008542307843 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0845_text_document -0.00016320230298972016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0846_text_document -0.00016176830866038722 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0847_text_document -0.00015883945834286212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0848_text_document -0.00015854734059433728 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0849_text_document -0.00015424048326372636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0850_text_document -0.00015913631543321879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0851_text_document -0.00016242367155204024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0852_text_document -0.00016352898883564303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0853_text_document -0.00016283852574114027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0854_text_document -0.0001597064012689706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0855_text_document -0.00015723207463854053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0856_text_document -0.00016082454091186785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0857_text_document -0.00015148430437371348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0858_text_document -0.00015699196205345046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0859_text_document -0.00016323993834433252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0860_text_document -0.00015419189482936103 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0861_text_document -0.00014984592429281824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0862_text_document -0.0001540327550705441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0863_text_document -0.00015559458082419316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0864_text_document -0.00015809601003355687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0865_text_document -0.00015561437781246056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0866_text_document -0.00015650965510707114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0867_text_document -0.00015654223175785975 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0868_text_document -0.00015966194232830576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0869_text_document -0.0001542791440813034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0870_text_document -0.00016358133853488976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0871_text_document -0.0001610108148402946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0872_text_document -0.0001567861463301872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0873_text_document -0.00015916579076809533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0874_text_document -0.00015834187212170972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0875_text_document -0.00015492852942470005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0876_text_document -0.0001565761307746086 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0877_text_document -0.00016111787860345758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0878_text_document -0.00015262185821473176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0879_text_document -0.00015609313599061615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0880_text_document -0.00015265109415151545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0881_text_document -0.00015596676711588585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0882_text_document -0.00015602244000618423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0883_text_document -0.00015533087814847594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0884_text_document -0.000148761688602713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0885_text_document -0.00015124065708812265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0886_text_document -0.00015177148904071277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0887_text_document -0.00015551510213818192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0888_text_document -0.00015328016792414618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0889_text_document -0.00014826652573194586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0890_text_document -0.00015618973632950672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0891_text_document -0.00016465597460827412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0892_text_document -0.00017729797829003265 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0893_text_document -0.00017645710877786075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0894_text_document -0.000173993320599559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0895_text_document -0.0001752697954262395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0896_text_document -0.00017545831920313468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0897_text_document -0.00017512052874093406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0898_text_document -0.00017596295211949001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0899_text_document -0.0001763343681416489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0900_text_document -0.00016737628055788186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0901_text_document -0.00017659674006013248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0902_text_document -0.00017521085067973818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0903_text_document -0.00018110203496350606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0904_text_document -0.00016887408015540739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0905_text_document -0.0001730418383091983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0906_text_document -0.00017084812178309202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0907_text_document -0.00016928946570955264 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0908_text_document -0.00017272373105947043 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0909_text_document -0.00016793546933797045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0910_text_document -0.00016510473373737477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0911_text_document -0.0001656625036518595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0912_text_document -0.00016849674877913583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0913_text_document -0.00017492155042464418 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0914_text_document -0.00017092357710033054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0915_text_document -0.00016970730743877006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0916_text_document -0.00016573665091766286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0917_text_document -0.00016358480536479716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0918_text_document -0.0001653802811890403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0919_text_document -0.00017231807148475074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0920_text_document -0.00017361608596973323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0921_text_document -0.00017404933358323055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0922_text_document -0.00016371945617952907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0923_text_document -0.00017000836658266155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0924_text_document -0.00017142976487027857 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0925_text_document -0.00017006281434704977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0926_text_document -0.0001751965302313473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0927_text_document -0.00016954848753554936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0928_text_document -0.0001683555446267139 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0929_text_document -0.00016921278107076727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0930_text_document -0.00016808682594394623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0931_text_document -0.00017711704047105475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0932_text_document -0.0001675247295876393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0933_text_document -0.00017061773073498863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0934_text_document -0.0001644856648306077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0935_text_document -0.00016530682645009105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0936_text_document -0.00016993430076157017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0937_text_document -0.00016716870217360928 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0938_text_document -0.0001672477045314564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0939_text_document -0.00016150529456268964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0940_text_document -0.0001642955368396883 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0941_text_document -0.0001650135010986092 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0942_text_document -0.0001719916971031507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0943_text_document -0.0001663860254017646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0944_text_document -0.00016810785027934324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0945_text_document -0.00016663511368772123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0946_text_document -0.00017120237493641126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0947_text_document -0.0001651698100366788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0948_text_document -0.00016069571413445028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0949_text_document -0.0001631772602215936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0950_text_document -0.00016994484266892867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0951_text_document -0.00016821930169126347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0952_text_document -0.0001680542144940534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0953_text_document -0.00015807234911071054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0954_text_document -0.00016287290799651364 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0955_text_document -0.00016674360421415713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0956_text_document -0.0001663549971877126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0957_text_document -0.0001699417467826641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0958_text_document -0.0001661066433849769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0959_text_document -0.00016736976350010906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0960_text_document -0.00016160049405253383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0961_text_document -0.0001625500850979611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0962_text_document -0.00016172349111618741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0963_text_document -0.00016041582790085466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0964_text_document -0.00016369413378455798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0965_text_document -0.00016245798272839223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0966_text_document -0.00016458727969573578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0967_text_document -0.0001618972714257936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0968_text_document -0.00016149423535800886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0969_text_document -0.00015886933917368354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0970_text_document -0.00015721961433801126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0971_text_document -0.00015609496997744904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0972_text_document -0.0001608435755282705 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0973_text_document -0.00015730100598754584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0974_text_document -0.00015955845719642757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0975_text_document -0.00015469663090901824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0976_text_document -0.00015812452037199733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0977_text_document -0.00015443940925795885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0978_text_document -0.00015678701926941855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0979_text_document -0.00015787925332384637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0980_text_document -0.00015669644312439214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0981_text_document -0.00015342587917756964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0982_text_document -0.00015642024238741553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0983_text_document -0.0001540823378708023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0984_text_document -0.00015238224416999995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0985_text_document -0.0001522695061784323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0986_text_document -0.00020085620305657233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0987_text_document -0.00014698197479826313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0988_text_document -0.00014796924883111914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0989_text_document -0.0001483800966807953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0990_text_document -0.00014550940307048242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0991_text_document -0.00015052597307667803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0992_text_document -0.00014866583878918362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0993_text_document -0.00014440801314961302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0994_text_document -0.00014295564464645108 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0995_text_document -0.00014903049761507035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0996_text_document -0.00014820091066353183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0997_text_document -0.0001429454882440627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0998_text_document -0.00015048550764172483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-0999_text_document -0.0001430543312039796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1000_text_document -0.00014661342883839465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1001_text_document -0.00014721354013103223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1002_text_document -0.00014780017824708586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1003_text_document -0.0001463184859455721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1004_text_document -0.00014654870719379106 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1005_text_document -0.00020943212095457075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1006_text_document -0.00021205821955900777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1007_text_document -0.00014176730212983274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1008_text_document -0.00014026276433980122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1009_text_document -0.00013570196535880505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1010_text_document -0.00014776685378575983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1011_text_document -0.00014138218982193943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1012_text_document -0.0001412602382122253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1013_text_document -0.00013944232659104602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1014_text_document -0.00014570617769030735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1015_text_document -0.00014233071172042007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1016_text_document -0.00014016762901851798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1017_text_document -0.0001434413757259645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1018_text_document -0.00014003324697133565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1019_text_document -0.00014567282904236987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1020_text_document -0.00013992559507863123 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1021_text_document -0.00021096883039305026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1022_text_document -0.00014274603730164107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1023_text_document -0.00013914595792215918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1024_text_document -0.00013666688380542608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1025_text_document -0.00014001152690065646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1026_text_document -0.00021392615254787925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1027_text_document -0.00014251166508793392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1028_text_document -0.00013886942449587415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1029_text_document -0.0002078004025575127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1030_text_document -0.00020928673622040174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1031_text_document -0.00020558733131260538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1032_text_document -0.0002036663760886078 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1033_text_document -0.00014592860566679667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1034_text_document -0.00014346325128200297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1035_text_document -0.00014068142446497316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1036_text_document -0.000142996292961803 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1037_text_document -0.00020633185839414136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1038_text_document -0.00013684538988274547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1039_text_document -0.0002033768324865864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1040_text_document -0.000200593087523188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1041_text_document -0.0002297294147093001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1042_text_document -0.00022971372080690233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1043_text_document -0.00023092966691083417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1044_text_document -0.00015159247973379415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1045_text_document -0.00015257723761865372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1046_text_document -0.00015750287090187065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1047_text_document -0.00015557071949799488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1048_text_document -0.00015138603787345713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1049_text_document -0.00014966823068820163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1050_text_document -0.00015481393029806212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1051_text_document -0.0001521335747073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1052_text_document -0.00015447866363472483 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1053_text_document -0.0001564823000495303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1054_text_document -0.00015484698673224505 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1055_text_document -0.00022305811126444646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1056_text_document -0.00015308102523761935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1057_text_document -0.00022494528198789627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1058_text_document -0.0002206911435725598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1059_text_document -0.00021440132246946592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1060_text_document -0.00014934935094772055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1061_text_document -0.00015275047150828305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1062_text_document -0.00021692931968428998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1063_text_document -0.00023057843831795596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1064_text_document -0.00022061661869945533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1065_text_document -0.0001475889972917192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1066_text_document -0.00014965255899799802 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1067_text_document -0.000146325773766483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1068_text_document -0.00021849119850040293 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1069_text_document -0.00021649545481859658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1070_text_document -0.00014463616989778393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1071_text_document -0.00014301572221485565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1072_text_document -0.00014804643324427358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1073_text_document -0.0002143783669071859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1074_text_document -0.0001479303814401362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1075_text_document -0.00015068744684349907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1076_text_document -0.00021658806091136903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1077_text_document -0.00021333945668012075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1078_text_document -0.000142221472149436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1079_text_document -0.0002158096794842747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1080_text_document -0.00021541031163695796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1081_text_document -0.0002160301031804424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1082_text_document -0.00014484879119054217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1083_text_document -0.00014717950537309672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1084_text_document -0.00021016132927298846 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1085_text_document -0.00021433713539833563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1086_text_document -0.0001438233936284062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1087_text_document -0.0001447086593934949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1088_text_document -0.00021440017582664183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1089_text_document -0.00020841624205804798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1090_text_document -0.000213227136771408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1091_text_document -0.00020931414236598925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1092_text_document -0.0002134545412666026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1093_text_document -0.0002126803251195216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1094_text_document -0.00014114550507201583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1095_text_document -0.00016444080384922814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1096_text_document -0.0001542515002652382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1097_text_document -0.0001608177523717217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1098_text_document -0.0001577693965006662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1099_text_document -0.0001615213258436368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1100_text_document -0.00014975169893108998 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1101_text_document -0.00015902857074290308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1102_text_document -0.00015523901418979132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1103_text_document -0.00015842052994374488 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1104_text_document -0.0001543439686424067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1105_text_document -0.0001559141331005536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1106_text_document -0.0001558557495821586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1107_text_document -0.00016108187362389814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1108_text_document -0.0001605357063724452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1109_text_document -0.0001588416921491903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1110_text_document -0.00015452564563384654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1111_text_document -0.0001575925464658241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1112_text_document -0.000155416389913229 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1113_text_document -0.00015834897089216795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1114_text_document -0.00015376802717866433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1115_text_document -0.00015257616131444455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1116_text_document -0.00015333466381495513 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1117_text_document -0.00015356006723825613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1118_text_document -0.00015392513748333956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1119_text_document -0.00015808193589371923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1120_text_document -0.00015572715307115401 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1121_text_document -0.00015677288071421776 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1123_text_document -0.00015564703516755468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1124_text_document -0.00015473730933423342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1125_text_document -0.00015227152970932222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1126_text_document -0.00015062363935408713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1127_text_document -0.0001608838990519831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1128_text_document -0.00016058746991656767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1129_text_document -0.00015232158785053588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1130_text_document -0.00015216796930278597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1131_text_document -0.00015531087359959403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1132_text_document -0.00017455174602057423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1133_text_document -0.00015220395996782025 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1134_text_document -0.00022536045257736233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1135_text_document -0.00023391977994072452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1136_text_document -0.00022316737354122904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1137_text_document -0.00023097409031198833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1138_text_document -0.0001536444602488289 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1139_text_document -0.00015290857223001657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1140_text_document -0.00015053717764782956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1141_text_document -0.0001487906308449292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1142_text_document -0.00022796481136694752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1143_text_document -0.00022388054021300896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1144_text_document -0.00015633876287631285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1146_text_document -0.00015683128496399404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1147_text_document -0.0001498588984354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1148_text_document -0.00015466674094651695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1149_text_document -0.00015104328866230663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1150_text_document -0.0001510288850415886 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1151_text_document -0.00015453329995596143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1152_text_document -0.0001717890160140908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1153_text_document -0.00016303689223488152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1154_text_document -0.00017438742884609578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1155_text_document -0.00017195307231868866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1156_text_document -0.00016630614911747752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1157_text_document -0.0001738954845222655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1158_text_document -0.00016759158755171884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1159_text_document -0.00017061259922452842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1160_text_document -0.00017196072417278202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1161_text_document -0.00016824585118656202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1162_text_document -0.00016301309236242047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1163_text_document -0.0001718575393991296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1164_text_document -0.00017003663826341565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1165_text_document -0.00017018328983305946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1166_text_document -0.00017218141861091656 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1167_text_document -0.00016559619112054818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1168_text_document -0.00016284882257395627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1169_text_document -0.0001617104078870124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1170_text_document -0.00016849349395228177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1171_text_document -0.00016378319727916067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1172_text_document -0.00017114019486042634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1173_text_document -0.0001726823065513329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1174_text_document -0.00016244897469644304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1175_text_document -0.0001613681046473606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1176_text_document -0.00018118661924575096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1177_text_document -0.00016563345750593493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1178_text_document -0.00016790014898759615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1179_text_document -0.0001629142142864177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1180_text_document -0.00016191717527939525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1181_text_document -0.0001671004065869619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1182_text_document -0.0001675370141650324 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1183_text_document -0.00016799445480682778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1184_text_document -0.0001719736620354862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1185_text_document -0.00016261057260474936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1186_text_document -0.00015865991174764644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1187_text_document -0.00015739800441831657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1188_text_document -0.00016171134746282626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1189_text_document -0.00016720238820009615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1190_text_document -0.00016497201020069133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1191_text_document -0.00016081080933342493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1192_text_document -0.0001598451415954535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1193_text_document -0.00016189725587725768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1194_text_document -0.00015376149407875128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1195_text_document -0.00015923032632387212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1196_text_document -0.000161420662154024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1197_text_document -0.00015926844960634996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1198_text_document -0.000156372807999939 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1199_text_document -0.00016050285429044874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1200_text_document -0.00015617925982472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1201_text_document -0.00016514079794945202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1202_text_document -0.00016522274070820443 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1203_text_document -0.0001597381170738336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1204_text_document -0.0001616744058690789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1205_text_document -0.00016029435854255644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1206_text_document -0.0001600416279503584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1207_text_document -0.0001607379715998696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1208_text_document -0.0001593514911283079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1209_text_document -0.00015864317782095664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1210_text_document -0.00015911735436385907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1211_text_document -0.0001556275795066712 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1212_text_document -0.0001656764173702947 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1213_text_document -0.00015679155524627255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1214_text_document -0.00016376988600479205 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1215_text_document -0.0001581538165285075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1216_text_document -0.0001610240227045592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1217_text_document -0.00015776131940645536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1218_text_document -0.00015818231748846595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1219_text_document -0.0001625550897521123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1220_text_document -0.0001547371099180901 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1221_text_document -0.00015414283944531357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1222_text_document -0.00016266088273096592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1223_text_document -0.00016083169545961368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1224_text_document -0.0001573027086756309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1225_text_document -0.00015728313997935927 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1226_text_document -0.00016781226249248295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1227_text_document -0.00014976228995207784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1228_text_document -0.00015444629923379175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1229_text_document -0.00015203154472094758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1230_text_document -0.00015416974359531256 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1231_text_document -0.00015545110214308707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1232_text_document -0.0001510309557116906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1233_text_document -0.000150151986610048 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1234_text_document -0.00014833490597173326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1235_text_document -0.00014730918386476007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1236_text_document -0.00014903663558472915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1237_text_document -0.00014834903218682616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1238_text_document -0.00015322537809196756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1239_text_document -0.0001511230642513134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1240_text_document -0.00015357591909403477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1241_text_document -0.00015295542934724653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1242_text_document -0.00015013958035919124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1243_text_document -0.00015023610122778707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1244_text_document -0.00014784318253583398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1245_text_document -0.00015065966876706016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1246_text_document -0.0001481405433493943 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1247_text_document -0.00014721741369089534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1248_text_document -0.00014730057861393202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1249_text_document -0.00015235999841072513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1250_text_document -0.00014541040677624616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1251_text_document -0.00014639042630648248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1252_text_document -0.00015068532335773535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1253_text_document -0.00015516053357170532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1254_text_document -0.00014515004876336832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1255_text_document -0.0001488593805475465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1256_text_document -0.0001506759742452044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1257_text_document -0.0001429840653957083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1258_text_document -0.00014437998012654534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1259_text_document -0.0001428860592717282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1260_text_document -0.0001475220383855572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1261_text_document -0.00014640582972274082 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1262_text_document -0.0001505350968588391 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1263_text_document -0.00014784485165882563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1264_text_document -0.00014770697193146622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1265_text_document -0.0001433464625266231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1266_text_document -0.00014139730694769496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1267_text_document -0.00014139435371307747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1268_text_document -0.00014164383589527758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1269_text_document -0.0001429075740030123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1270_text_document -0.00014605872692153072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1271_text_document -0.0001424796215298057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1272_text_document -0.00014112515203848743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1273_text_document -0.00014039188160335826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1274_text_document -0.00014502736267043328 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1275_text_document -0.00014184146815260007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1276_text_document -0.0001453216584479987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1277_text_document -0.00014226985746562565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1278_text_document -0.00013903471234323833 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1279_text_document -0.00014633669945119654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1280_text_document -0.00015567823959834718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1281_text_document -0.00016711998145328748 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1282_text_document -0.00016716820782888765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1283_text_document -0.00016788189624042867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1284_text_document -0.00016762149528397544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1285_text_document -0.00016394982452183396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1286_text_document -0.00017499487929449305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1287_text_document -0.00017285598246362648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1288_text_document -0.0001813127546456402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1289_text_document -0.00016923644001919636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1290_text_document -0.00016671545149204298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1291_text_document -0.0001691584149978932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1292_text_document -0.00016279240063910965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1293_text_document -0.00016581675179191334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1294_text_document -0.00016709742151486606 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1295_text_document -0.00016462921631835026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1296_text_document -0.0001635773235573904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1297_text_document -0.0001629499633321397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1298_text_document -0.00016244603775076793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1299_text_document -0.00016565874682941692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1300_text_document -0.00016704769334813707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1301_text_document -0.00016527793060668047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1302_text_document -0.0001614670182628741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1303_text_document -0.00016090321773766912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1304_text_document -0.00016205158644923216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1305_text_document -0.00016115649647745916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1306_text_document -0.00016750884342636079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1307_text_document -0.0001593023982303325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1308_text_document -0.00015894512446540672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1309_text_document -0.00016391499925658774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1310_text_document -0.0001615310219600013 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1311_text_document -0.00016109142610140696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1312_text_document -0.0001622135071747606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1313_text_document -0.00016686311075489617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1314_text_document -0.00016322992039795453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1315_text_document -0.00015923727775344227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1316_text_document -0.00016528070219491 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1317_text_document -0.00016089805290891765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1318_text_document -0.00016142731643379644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1319_text_document -0.00016164621217780662 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1320_text_document -0.00015738061325748116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1321_text_document -0.0001591233926254462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1322_text_document -0.00016649327648776514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1323_text_document -0.00016299925243783037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1324_text_document -0.00016490993699004063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1325_text_document -0.0001589061309585213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1326_text_document -0.00015701373074415468 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1327_text_document -0.00015755460137450403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1328_text_document -0.00016368403834230255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1329_text_document -0.0001619141257919363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1330_text_document -0.0002274793692927606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1331_text_document -0.0001567633247814788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1332_text_document -0.00022905033511751312 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1333_text_document -0.0001548301064518758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1334_text_document -0.000226605319945327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1335_text_document -0.00022667037674726058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1336_text_document -0.00022923961805784498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1337_text_document -0.00014906828549341607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1338_text_document -0.00015829222539969273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1339_text_document -0.0001509036911919305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1340_text_document -0.00022536653378252486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1341_text_document -0.00015104016760222197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1342_text_document -0.00015099364342110257 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1343_text_document -0.00022777331115603203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1344_text_document -0.00021580582739619934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1345_text_document -0.0001492017484493636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1346_text_document -0.0002232038326367584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1347_text_document -0.0002173110715340058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1348_text_document -0.0002106853410947563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1349_text_document -0.00021523392953900664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1350_text_document -0.00021996424976477582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1351_text_document -0.00021735745725911482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1352_text_document -0.00014743618479981591 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1353_text_document -0.00021587099328468655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1354_text_document -0.00021669175360386172 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1355_text_document -0.00021667379282364665 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1356_text_document -0.0002192120523189847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1357_text_document -0.00021547193097844086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1358_text_document -0.00021621049112421326 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1359_text_document -0.00021196265801039842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1360_text_document -0.00021115416894129982 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1361_text_document -0.00021548122875612305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1362_text_document -0.0002167839127379268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1363_text_document -0.00021388435981092266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1364_text_document -0.00021247309275187394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1365_text_document -0.00020865156988970925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1366_text_document -0.00021232420243985875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1367_text_document -0.00020288941772275403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1368_text_document -0.00020534370920083462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1369_text_document -0.00014906807620518648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1370_text_document -0.0002110153701227056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1371_text_document -0.00020709542453451886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1372_text_document -0.00020465988557797482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1373_text_document -0.000195974694790701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1374_text_document -0.0002006410964660873 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1375_text_document -0.00020083864604468702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1376_text_document -0.00020640909562295756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1377_text_document -0.0002009390668809768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1378_text_document -0.00019660322090934407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1379_text_document -0.0002031382964736789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1380_text_document -0.00019629671755665872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1381_text_document -0.00019754174238439996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1382_text_document -0.0002056909946356413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1383_text_document -0.0001979138566098626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1384_text_document -0.0001932131948461709 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1385_text_document -0.00020416546879013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1386_text_document -0.00020460391232945065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1387_text_document -0.00019389888059130955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1388_text_document -0.00019783854863351214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1389_text_document -0.000200961415063147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1390_text_document -0.0001956818423121531 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1391_text_document -0.00020637040765714317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1392_text_document -0.00020119793791085526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1393_text_document -0.0002019159752232148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1394_text_document -0.00020709690510066213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1395_text_document -0.00019733093804912572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1396_text_document -0.0001880608678579731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1397_text_document -0.0002016375431479316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1398_text_document -0.00019179791527764437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1399_text_document -0.00018506553224762644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1400_text_document -0.00019958850500821938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1401_text_document -0.0001963985599733761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1402_text_document -0.00019686962952391687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1403_text_document -0.00019466431453041557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1404_text_document -0.00019423474723069192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1405_text_document -0.00018645004940802463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1406_text_document -0.0001957563417646353 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1407_text_document -0.00019567310057973193 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1408_text_document -0.00019820964060443815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1409_text_document -0.0001922448994056278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1410_text_document -0.00018809380854194413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1411_text_document -0.00019183325882742152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1412_text_document -0.00018979529371331087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1413_text_document -0.00018194205843788177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1414_text_document -0.000185326810832552 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1415_text_document -0.00018768967790659056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1416_text_document -8.238038512980449e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1417_text_document -7.037628876350043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1418_text_document -7.031761895460266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1419_text_document -6.852561440270574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1420_text_document -7.163053214543125e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1421_text_document -6.965337217248569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1422_text_document -7.217926984135532e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1423_text_document -6.887448282655111e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1424_text_document -7.065036798913058e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1425_text_document -7.00112034634854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1426_text_document -7.22545398101735e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1427_text_document -7.06581990215903e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1428_text_document -8.704526082745054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1429_text_document -7.647895905010174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1430_text_document -6.667599117230014e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1431_text_document -9.175439580281598e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1432_text_document -6.936196694178977e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1433_text_document -6.994579003243415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1434_text_document -6.85501978720171e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1435_text_document -6.733846418731063e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1436_text_document -6.760126406073544e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1437_text_document -6.979877393600358e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1438_text_document -6.866399513844505e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1439_text_document -6.599386727589954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1440_text_document -7.022110351565428e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1441_text_document -6.889110495186351e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1442_text_document -7.249533430962498e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1443_text_document -7.061312850517899e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1444_text_document -7.026495137417699e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1445_text_document -7.053710208774785e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1446_text_document -7.079302654666706e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1447_text_document -7.142821385554296e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1448_text_document -6.884074447800683e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1449_text_document -6.775299728680366e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1450_text_document -6.935640081273007e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1451_text_document -7.071164131398859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1452_text_document -7.251697614402021e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1453_text_document -0.00012391766284956256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1454_text_document -6.876051279861284e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1455_text_document -7.000563116437178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1456_text_document -7.021430732464126e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1457_text_document -7.320305084935923e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1458_text_document -6.93854906426365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1459_text_document -7.268065730933861e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1460_text_document -0.00015861223006440801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1461_text_document -0.00018316051674097559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1462_text_document -0.00018895109829526356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1463_text_document -0.0001814266629730391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1464_text_document -0.00019157095403912478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1465_text_document -0.0001816102282477865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1466_text_document -0.00018024098024327291 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1467_text_document -0.00018881439877582162 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1468_text_document -0.00018219752655961166 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1469_text_document -0.00018211134990984607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1470_text_document -0.00018153994220173833 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1471_text_document -0.00018033319169939537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1472_text_document -0.00017832883141386175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1473_text_document -0.00018126942359083546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1474_text_document -0.0001780495979719729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1475_text_document -0.00017938729404825616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1476_text_document -0.00017959022144586935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1477_text_document -0.00017639465376427234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1478_text_document -0.00017486395149030808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1479_text_document -0.0001808240528085561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1480_text_document -0.00017274905690967175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1481_text_document -0.00017486036693577152 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1482_text_document -0.00017698269026135074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1483_text_document -0.00016841434450034874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1484_text_document -0.00016863721573351308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1485_text_document -0.00017483420640067329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1486_text_document -0.0001732828833128397 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1487_text_document -0.00017295290907496933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1488_text_document -0.00017133232046800912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1489_text_document -0.00016977827391836668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1490_text_document -0.0001740935435005184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1491_text_document -3.6341265268857285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/cc_en_tail-1492_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_peS2o.txt b/ALCF/data-lists/sunspot/data_file_list_peS2o.txt deleted file mode 100644 index dff1249cf5..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_peS2o.txt +++ /dev/null @@ -1,26 +0,0 @@ -0.001258392312111664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0000_text_document -0.0012624388132337304 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0001_text_document -0.0012626279540316713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0002_text_document -0.0012611745647392154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0003_text_document -0.001261744080471196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0004_text_document -0.0012540395730196387 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0005_text_document -0.0012674794155994474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0006_text_document -0.0015181894311854882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0007_text_document -0.003353431842116585 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0008_text_document -0.0033457523561418873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0009_text_document -0.0035360737173355393 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0010_text_document -0.0035328021064248917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0011_text_document -0.003537853575841124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0012_text_document -0.0033495442948704096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0013_text_document -0.0033515559656802623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0014_text_document -0.0035292441286648877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0015_text_document -0.0033472466636064995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0016_text_document -0.003347244907254542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0017_text_document -0.003361109976122766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0018_text_document -0.003527949940706846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0019_text_document -0.0033629959027952918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0020_text_document -0.003534363177394335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0021_text_document -0.0033534091101340303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0022_text_document -0.003362863367631581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0023_text_document -0.0035295619700253587 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0024_text_document -0.00039091085286111746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/pes2o_v2-0025_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_reddit.txt b/ALCF/data-lists/sunspot/data_file_list_reddit.txt deleted file mode 100644 index 644d717021..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_reddit.txt +++ /dev/null @@ -1,78 +0,0 @@ -0.0011541728836721287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0000_text_document -0.001194214065746794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0001_text_document -0.0012074645870644872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0002_text_document -0.0011669676257397446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0003_text_document -0.0011730429598479002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0004_text_document -0.0011829645036126979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0005_text_document -0.0011882713613863669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0006_text_document -0.0011409601969657492 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0007_text_document -0.0011370779956530767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0008_text_document -0.0011566277261230336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0009_text_document -0.0011377530435595722 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0010_text_document -0.0011422212106036002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0011_text_document -0.0011359648236479313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0012_text_document -0.0011422819300771266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0013_text_document -0.0011613422088431185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0014_text_document -0.001149222546698594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0015_text_document -0.0011520428345756523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0016_text_document -0.0011408015787470732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0017_text_document -0.001145413257179254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0018_text_document -0.0011543340882314167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0019_text_document -0.0011397083750923865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0020_text_document -0.001163788652940794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0021_text_document -0.0011441686420414542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0022_text_document -0.0011429505546541332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0023_text_document 
-0.00117471168582067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0024_text_document -0.0011456585273133617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0025_text_document -0.0011738665177335344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0026_text_document -0.0011646176186295262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0027_text_document -0.0011629386473461694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0028_text_document -0.0011421097688385183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0029_text_document -0.0011459477142114253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0030_text_document -0.0011756431096178663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0031_text_document -0.0011482680809577622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0032_text_document -0.0011445710176100962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0033_text_document -0.001142534803152167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0034_text_document -0.0011422043218494292 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0035_text_document -0.0011678344410475695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0036_text_document -0.0011562147470581413 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0037_text_document -0.0011468122833549663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0038_text_document -0.0011532706690152916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0039_text_document -0.0011292882378850658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0040_text_document -0.0011300177059999066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0041_text_document -0.0011287171558685828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0042_text_document -0.0011295841562723513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0043_text_document -0.0011279954847952854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0044_text_document -0.0011283817109930107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0045_text_document -0.001128286479630481 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0046_text_document -0.0011276081740353844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0047_text_document -0.0011268985652144736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0048_text_document -0.0011261863340342809 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0049_text_document -0.0011248860240274238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0050_text_document 
-0.0011253794147731645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0051_text_document -0.0011242857628861397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0052_text_document -0.0011228472942657042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0053_text_document -0.00112269047698053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0054_text_document -0.0011234938283922757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0055_text_document -0.0011230927745087202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0056_text_document -0.0011247141749506225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0057_text_document -0.0011241207913742775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0058_text_document -0.0011220187728072355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0059_text_document -0.0011227320045060405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0060_text_document -0.0011217839100677303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0061_text_document -0.0011210875921360617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0062_text_document -0.0011221651716921029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0063_text_document -0.0011248396609954611 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0064_text_document -0.0012275703827670792 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0065_text_document -0.0011056036331311184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0066_text_document -0.001107902944963784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0067_text_document -0.0010968114497626087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0068_text_document -0.0011027306309299484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0069_text_document -0.0010853624892717291 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0070_text_document -0.0011051858405711837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0071_text_document -0.0010808015771539223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0072_text_document -0.0010855928806935572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0073_text_document -0.0010442141182932184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0074_text_document -0.0011804749731815143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0075_text_document -0.0011670805522744465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0076_text_document -0.0008366052616529944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/reddit-v5-dedupe-pii-nsfw-toxic-0077_text_document diff 
--git a/ALCF/data-lists/sunspot/data_file_list_stack.txt b/ALCF/data-lists/sunspot/data_file_list_stack.txt deleted file mode 100644 index cbaf3cedde..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_stack.txt +++ /dev/null @@ -1,149 +0,0 @@ -0.0010659025986423038 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0000_text_document -0.001089820700651703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0001_text_document -0.0010894690468995446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0002_text_document -0.0010893103153582777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0003_text_document -0.001092968830569157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0004_text_document -0.0010927822953669655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0005_text_document -0.0010948538530423937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0006_text_document -0.0010914947459084862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0007_text_document -0.0011531345061061805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0008_text_document -0.0009273732822541429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0009_text_document -0.0009298094568342398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0010_text_document -0.0009269985376241653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0011_text_document -0.0009299414467502114 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0012_text_document -0.0009281292496915194 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0013_text_document -0.0009300797305068478 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0014_text_document -0.0009575658299825903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0015_text_document -0.001124706364232967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0016_text_document -0.0011201757618238954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0017_text_document -0.001126433347327465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0018_text_document -0.0011299837668245817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0019_text_document -0.001127851225271931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0020_text_document -0.0011265589698280143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0021_text_document -0.0011227970380980016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0022_text_document -0.001131300918127052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0023_text_document -0.00112588381546472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0024_text_document -0.0011692456277892793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0025_text_document -0.0011330744556493294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0026_text_document -0.001041946972706877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0027_text_document -0.0010493121881969634 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0028_text_document -0.0009912570469629923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0029_text_document -0.0012717963903526445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0030_text_document -0.0014051955824199262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0031_text_document -0.0011248653480876683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0032_text_document -0.0015096975127629315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0033_text_document -0.001056885183600456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0034_text_document -0.0010523010671513575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0035_text_document -0.001055691055690255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0036_text_document -0.0012434898779499373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0037_text_document -0.0009615620261395163 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0038_text_document -0.0011689290747945063 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0039_text_document -0.0012610288149681123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0040_text_document -0.0012183045747008489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0041_text_document -0.0012232394891956877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0042_text_document -0.0012316862572191265 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0043_text_document -0.001171858466558184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0044_text_document -0.0009288715082322405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0045_text_document -0.0009096255640660796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0046_text_document -0.0009098493089021282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0047_text_document -0.000908428701094243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0048_text_document -0.0009115948236386599 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0049_text_document -0.0009109761446993803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0050_text_document -0.0009097199236925156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0051_text_document -0.0009103946801923116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0052_text_document -0.0009109038594994949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0053_text_document -0.0009098133932243314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0054_text_document -0.0009111744494635876 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0055_text_document -0.0008961257268851344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0056_text_document -0.0008499219991848833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0057_text_document -0.000848817192629684 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0058_text_document -0.0008469931268429987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0059_text_document -0.0008487804660301039 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0060_text_document -0.0008535293627452302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0061_text_document -0.0008508082359285502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0062_text_document -0.000847764423021283 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0063_text_document -0.0008661814491784624 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0064_text_document -0.0012598427266996145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0065_text_document -0.0015411645064455006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0066_text_document -0.0015500690406153115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0067_text_document -0.0010431702414192465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0068_text_document -0.0010103298065465376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0069_text_document -0.0009173697763272889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0070_text_document -0.0009149081716719212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0071_text_document -0.0009223001515794829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0072_text_document -0.0009231205497115238 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0073_text_document -0.0009205400022638854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0074_text_document -0.000921891356231865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0075_text_document -0.0009206550523916788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0076_text_document -0.000919101114727538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0077_text_document -0.0009189314293443922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0078_text_document -0.0009187845413397615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0079_text_document -0.0009212488966514148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0080_text_document -0.0009193937503280587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0081_text_document -0.0013803871878583557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0082_text_document -0.0009950213666737198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0083_text_document -0.000927893134699511 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0084_text_document -0.0009256115426841411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0085_text_document -0.0009245248815034989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0086_text_document -0.0009239324963431647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0087_text_document -0.00093017264782812 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0088_text_document -0.0009246774971430524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0089_text_document -0.0009246651817682976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0090_text_document -0.0009220962135479767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0091_text_document -0.0009218191222144196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0092_text_document -0.0009271314108370893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0093_text_document -0.0011393174361636815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0094_text_document -0.0010056046636817732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0095_text_document -0.000985188940051775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0096_text_document -0.0009834908338499898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0097_text_document -0.0009841221104671695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0098_text_document -0.0009846688252964021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0099_text_document -0.0009846837273836892 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0100_text_document -0.000983200779763785 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0101_text_document -0.000983626091844726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0102_text_document -0.0009227550215195058 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0103_text_document -0.0008517634745985513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0104_text_document -0.0009820984183696825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0105_text_document -0.001062956613371643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0106_text_document -0.0009446580160861343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0107_text_document -0.000849273787178016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0108_text_document -0.0010838798124933814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0109_text_document -0.0016259767652594482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0110_text_document -0.0009261166233974987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0111_text_document -0.0013044836937627727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0112_text_document -0.0017111272224419217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0113_text_document -0.0017274616815008634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0114_text_document -0.0017204942871235126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0115_text_document -0.0017119592701771347 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0116_text_document -0.0016979912192342588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0117_text_document -0.001701886248500233 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0118_text_document -0.0017227272126357288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0119_text_document -0.0017014517255794117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0120_text_document -0.0016995002579026628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0121_text_document -0.0016958447424626011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0122_text_document -0.0017111887981161064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0123_text_document -0.0017172926007805738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0124_text_document -0.0016938659465618113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0125_text_document -0.0016877576226485259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0126_text_document -0.0017144361080061983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0127_text_document -0.0017173753931755767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0128_text_document -0.001713308056226134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0129_text_document -0.0017126769067653286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0130_text_document -0.0017129095633438736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0131_text_document -0.001704961253905759 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0132_text_document -0.0009282082505873367 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0133_text_document -0.0007973220067601047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0134_text_document -0.0008407445714413182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0135_text_document -0.0008403726198530843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0136_text_document -0.0008371632157580058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0137_text_document -0.0013060325919558903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0138_text_document -0.0014100060700040244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0139_text_document -0.0008750222172256031 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0140_text_document -0.0016918433420911735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0141_text_document -0.001838605753011377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0142_text_document -0.0016004536814984726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0143_text_document -0.0011738110086663097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0144_text_document -0.0011269892510041232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0145_text_document -0.0011251329530758676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0146_text_document -0.0011788404279377853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0147_text_document -0.0007876495048700586 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/stack-v4-train-0148_text_document diff --git a/ALCF/data-lists/sunspot/data_file_list_wiki.txt b/ALCF/data-lists/sunspot/data_file_list_wiki.txt deleted file mode 100644 index 65169566eb..0000000000 --- a/ALCF/data-lists/sunspot/data_file_list_wiki.txt +++ /dev/null @@ -1,2 +0,0 @@ -0.0035577638528123345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/en_simple_wiki_v0-0000_text_document -0.0018422361471876658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_Llama2Tokenizer/en_simple_wiki_v0-0001_text_document diff --git a/ALCF/data-lists/sunspot/falcon.txt b/ALCF/data-lists/sunspot/falcon.txt new file mode 100644 index 0000000000..0b2fd6d43f --- /dev/null +++ b/ALCF/data-lists/sunspot/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document + diff --git a/ALCF/data-lists/sunspot/megawiki.txt b/ALCF/data-lists/sunspot/megawiki.txt new file mode 100644 index 0000000000..9fc9ca5dab --- /dev/null +++ b/ALCF/data-lists/sunspot/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document 
+6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document 
+5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document 
+7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document 
+1.5546966228590285e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document diff --git a/ALCF/data-lists/sunspot/open-web-math-train.txt 
b/ALCF/data-lists/sunspot/open-web-math-train.txt new file mode 100644 index 0000000000..b36e9977c0 --- /dev/null +++ b/ALCF/data-lists/sunspot/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document diff --git 
a/ALCF/data-lists/sunspot/pes2o.txt b/ALCF/data-lists/sunspot/pes2o.txt new file mode 100644 index 0000000000..63f805c06d --- /dev/null +++ b/ALCF/data-lists/sunspot/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document diff --git a/ALCF/data-lists/sunspot/reddit.txt b/ALCF/data-lists/sunspot/reddit.txt new file mode 100644 index 0000000000..59eafce1ee --- /dev/null +++ b/ALCF/data-lists/sunspot/reddit.txt @@ -0,0 +1,78 @@ +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document diff --git a/ALCF/data-lists/sunspot/stack.txt b/ALCF/data-lists/sunspot/stack.txt new file mode 100644 index 0000000000..297783ac22 --- /dev/null +++ b/ALCF/data-lists/sunspot/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document diff --git a/ALCF/data-lists/sunspot/starcoder.txt b/ALCF/data-lists/sunspot/starcoder.txt new file mode 100644 index 0000000000..37e6333de5 --- /dev/null +++ b/ALCF/data-lists/sunspot/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document 
+0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document 
+0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document + diff --git a/ALCF/data-lists/sunspot/tulu.txt b/ALCF/data-lists/sunspot/tulu.txt new file mode 100644 index 0000000000..2b75802501 --- /dev/null +++ b/ALCF/data-lists/sunspot/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document diff --git a/ALCF/data-lists/sunspot/wiki.txt b/ALCF/data-lists/sunspot/wiki.txt new file mode 100644 index 0000000000..52af00d57b --- /dev/null +++ b/ALCF/data-lists/sunspot/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document From 3648af572d926278429f35045abfd72eb1d76a76 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 17:51:23 -0500 Subject: [PATCH 205/268] Add `ALCF/test_sunspot.sh` --- ALCF/test_sunspot.sh | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100755 ALCF/test_sunspot.sh diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh new file mode 100755 index 0000000000..a8a4a21f32 --- /dev/null +++ b/ALCF/test_sunspot.sh @@ -0,0 +1,48 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sunspot @ ALCF + +# EXIT ON ERROR(s) +set -euxo pipefail + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ "${SHELL}" = "/bin/zsh" ]]; then + eval "$(~/miniconda3/bin/conda shell.zsh 
hook)" + else + eval "$(~/miniconda3/bin/conda shell.bash hook)" + fi + conda activate q4-drop +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + mkdir tmp && cd tmp + if [[ -d "Megatron-DeepSpeed" ]]; then + # rm -rfv Megatron-DeepSpeed/ + echo "Found existing Megatron-DeepSpeed. + Remove existing directory to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + git checkout remove-apex-deps +} + + +main() { + setup_conda + setup_megatron_deepspeed + # NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree + DEBUG=1 PBS_O_WORKDIR="$(pwd)" DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt LR=0.0008 GRAD_ACC_STEPS=8 ZERO_STAGE=1 NUM_LAYERS=10 MICRO_BATCH=8 OPT=adamwschedulefree TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh +} + +main From 9796eacf3c7392e20ff03aa2ea594ef3eb46ecfd Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 19:09:33 -0500 Subject: [PATCH 206/268] Add `ALCF/data-lists/sirius/books.txt` --- ALCF/data-lists/sirius/books.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ALCF/data-lists/sirius/books.txt diff --git a/ALCF/data-lists/sirius/books.txt b/ALCF/data-lists/sirius/books.txt new file mode 100644 index 0000000000..7567ba5227 --- /dev/null +++ b/ALCF/data-lists/sirius/books.txt @@ -0,0 +1,3 @@ +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0000_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0001_text_document +0.006 /lus/tegu/projects/PolarisAT/foremans/projects/argonne-lcf/Megatron-DeepSpeed/data/books-0002_text_document From 7b2ab6d984ddf4aaa380b8421ae39d1c9dec5d2a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 19:10:01 -0500 Subject: [PATCH 207/268] Add 
`ALCF/test_sirius.sh` --- ALCF/test_sirius.sh | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100755 ALCF/test_sirius.sh diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh new file mode 100755 index 0000000000..273d511afa --- /dev/null +++ b/ALCF/test_sirius.sh @@ -0,0 +1,66 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Sunspot @ ALCF + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + # if [[ "${SHELL}" = "/bin/zsh" ]]; then + # eval "$(~/miniconda3/bin/conda shell.zsh hook)" + # else + # eval "$(~/miniconda3/bin/conda shell.bash hook)" + # fi + # conda activate q4-drop +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + # rm -rfv Megatron-DeepSpeed/ + echo "Found existing Megatron-DeepSpeed. + Remove existing directory to run test." 
+ exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + git checkout remove-apex-deps +} + + +main() { + setup_conda + setup_megatron_deepspeed + # NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree + DEBUG=1 + PBS_O_WORKDIR="$(pwd)" + DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt + # LR=0.0008 + # GRAD_ACC_STEPS=8 + ZERO_STAGE=1 + NUM_LAYERS=10 + MICRO_BATCH=8 + # OPT=adamwschedulefree + TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee test-sirius.log +} + +main From 58cdccaa2438ac1da2a310652df3dc410e7a1138 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 19:17:30 -0500 Subject: [PATCH 208/268] Update `ALCF/test_sirius.sh` --- ALCF/test_sirius.sh | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh index 273d511afa..6199c5c157 100755 --- a/ALCF/test_sirius.sh +++ b/ALCF/test_sirius.sh @@ -2,7 +2,14 @@ # # Run complete test of # https://github.com/argonne-lcf/Megatron-DeepSpeed -# on Sunspot @ ALCF +# on Sirius @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sirius.sh +# ```` # EXIT ON ERROR(s) set -euxo pipefail @@ -18,12 +25,6 @@ setup_conda() { shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" micromamba activate 2024-04-23 - # if [[ "${SHELL}" = "/bin/zsh" ]]; then - # eval "$(~/miniconda3/bin/conda shell.zsh hook)" - # else - # eval "$(~/miniconda3/bin/conda shell.bash hook)" - # fi - # conda activate q4-drop } @@ -32,7 +33,7 @@ setup_conda() { # does not already exist ######################################## setup_megatron_deepspeed() { - OUTDIR="test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + 
OUTDIR="OUTPUTS/test-sirius-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" echo "Running test in: ${OUTDIR}" echo "WORKING DIRECTORY: $(realpath $(pwd .))" if [[ -d "Megatron-DeepSpeed" ]]; then @@ -49,18 +50,17 @@ setup_megatron_deepspeed() { main() { setup_conda setup_megatron_deepspeed - # NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree - DEBUG=1 - PBS_O_WORKDIR="$(pwd)" - DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt # LR=0.0008 # GRAD_ACC_STEPS=8 - ZERO_STAGE=1 - NUM_LAYERS=10 - MICRO_BATCH=8 - # OPT=adamwschedulefree - TIMING_LOG_LEVEL=1 - bash train_llama_alcf.sh |& tee test-sirius.log + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITERS=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log } main From 02a955c410429e8d1a39356b89ad4d1c191e88e5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 19:46:53 -0500 Subject: [PATCH 209/268] Create `alcf-tests` branch --- ALCF/test_sunspot.sh | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh index a8a4a21f32..29ec46fc03 100755 --- a/ALCF/test_sunspot.sh +++ b/ALCF/test_sunspot.sh @@ -3,10 +3,19 @@ # Run complete test of # https://github.com/argonne-lcf/Megatron-DeepSpeed # on Sunspot @ ALCF +# to launch (inside an interactive `qsub -I` job) on Sirius: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_sunspot.sh +# ```` # EXIT ON ERROR(s) set -euxo pipefail +NOW="$(date "+%Y-%m-%d-%H%M%S")" + ######################################################## # Setup / activate conda environment, # mine is called q4-drop @@ -26,9 +35,10 @@ setup_conda() { # does not already exist ######################################## 
setup_megatron_deepspeed() { - mkdir tmp && cd tmp + OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" if [[ -d "Megatron-DeepSpeed" ]]; then - # rm -rfv Megatron-DeepSpeed/ echo "Found existing Megatron-DeepSpeed. Remove existing directory to run test." exit @@ -41,8 +51,15 @@ setup_megatron_deepspeed() { main() { setup_conda setup_megatron_deepspeed - # NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree - DEBUG=1 PBS_O_WORKDIR="$(pwd)" DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt LR=0.0008 GRAD_ACC_STEPS=8 ZERO_STAGE=1 NUM_LAYERS=10 MICRO_BATCH=8 OPT=adamwschedulefree TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-suntpot-${NOW}".log } main From 23c953124d6025e9aef4eb7516c2d9dbe01169e2 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 19:49:41 -0500 Subject: [PATCH 210/268] Update `ALCF/{test_sirius.sh,test_sunspot.sh}` --- ALCF/test_sirius.sh | 4 ++-- ALCF/test_sunspot.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh index 6199c5c157..19cb9405dc 100755 --- a/ALCF/test_sirius.sh +++ b/ALCF/test_sirius.sh @@ -43,7 +43,7 @@ setup_megatron_deepspeed() { exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed - git checkout remove-apex-deps + git checkout alcf-tests } @@ -58,7 +58,7 @@ main() { export ZERO_STAGE=1 export NUM_LAYERS=10 export MICRO_BATCH=8 - export TRAIN_ITERS=20 + export TRAIN_ITER=20 export TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log } diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh 
index 29ec46fc03..ca3a59cc87 100755 --- a/ALCF/test_sunspot.sh +++ b/ALCF/test_sunspot.sh @@ -5,7 +5,7 @@ # on Sunspot @ ALCF # to launch (inside an interactive `qsub -I` job) on Sirius: # -# ```bash` +# ```bash # $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed # $ cd Megatron-DeepSpeed/ALCF # $ bash test_sunspot.sh @@ -44,7 +44,7 @@ setup_megatron_deepspeed() { exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed - git checkout remove-apex-deps + git checkout alcf-tests } @@ -59,7 +59,7 @@ main() { export MICRO_BATCH=8 export TRAIN_ITER=20 export TIMING_LOG_LEVEL=1 - bash train_llama_alcf.sh |& tee "test-suntpot-${NOW}".log + bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log" } main From 5fff0af7bdaafebb75f713ad088edac3be3d2d60 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 20:08:57 -0500 Subject: [PATCH 211/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 65 ++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 6530340c19..fe903e04f7 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -8,7 +8,7 @@ import math from functools import partial from megatron import get_args -from megatron import print_rank_0 +from megatron import log.info from megatron import get_timers from megatron import get_tokenizer from megatron.core import mpu, tensor_parallel @@ -74,7 +74,7 @@ def model_provider(pre_process=True, post_process=True): """Build the model.""" - print_rank_0('building GPT model ...') + log.info('building GPT model ...') see_memory_usage("Before Building Model", force=True) args = get_args() config = core_transformer_config_from_args(args) @@ -118,7 +118,7 @@ def model_provider(pre_process=True, post_process=True): # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe - # Predompute the attention mask and store 
it in args. + # Precompute the attention mask and store it in args. # This avoids having to pipeline it # as an activation during training. # The mask is constant, and thus we can reuse it. @@ -154,12 +154,9 @@ def model_provider(pre_process=True, post_process=True): ) num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - # print_rank_0('\n ------------------------ ') - # print_rank_0(f'num of parameters {num_params}') - # print_rank_0('------------------------\n ') - print_rank_0(80 * '-') - print_rank_0(f"Number of parameters in model: {num_params}") - print_rank_0(80 * '-') + log.info(80 * '-') + log.info(f"Number of parameters in model: {num_params}") + log.info(80 * '-') see_memory_usage("After Building Model", force=True) if wandb.run is not None: tbdir = args.tensorboard_dir @@ -342,7 +339,7 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor): 'moe loss': moe_loss, 'kd loss': mos_loss } - print_rank_0( + log.info( f'>>> total loss: {loss}, ' f'lm loss {averaged_loss[0]}, ' f'kd loss {mos_loss}' @@ -419,7 +416,8 @@ def forward_step(data_iterator, model): # Get the batch. timers('batch-generator', log_level=2).start() tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) + data_iterator + ) timers('batch-generator').stop() if args.data_efficiency_curriculum_learning: @@ -492,11 +490,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') + log.info( + '> building train, validation, and test datasets for GPT ...' 
+ ) files = [] if args.data_file_list is not None: - print_rank_0(f"Reading datasets from {args.data_file_list}") + log.info(f"Reading datasets from {args.data_file_list}") with open(args.data_file_list, 'r') as flist: for f in flist.readlines(): w, fname = f.split() @@ -523,8 +522,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): train_data_prefix=args.train_data_path, valid_data_prefix=args.valid_data_path, test_data_prefix=args.test_data_path, - data_cache_path=args.data_cache_path) - print_rank_0("> finished creating GPT datasets ...") + data_cache_path=args.data_cache_path, + ) + log.info("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds @@ -566,8 +566,6 @@ def git_ds_info(): def main(): - # if RANK == 0: - # setup_wandb() if os.getenv('TORCH_PROFILER_ENABLED') == '1': from torch.profiler import profile, record_function, ProfilerActivity with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: @@ -593,17 +591,26 @@ def main(): # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, data_post_process=data_post_process ) - try: - from megatron.text_generation import generate_and_post_process - with torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16): - response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32) - if RANK == 0: - log.info(f'generation completed..\n response:{response}') - except ValueError as ve: - log.critical(f'ValueError: {ve}') - pass + # try: + # from megatron.text_generation import generate_and_post_process + # with torch.autocast(device_type=DEVICE, dtype=args.dtype): + # response, _, _, _ = generate_and_post_process( + # model, + # prompts=[ + # "Hello world", + # "Nature is", + # "Turing test comprises", + # "Explain solar eclipse" + # ], + # tokens_to_generate=32 + # ) + # if RANK == 0: + # log.info(f'generation completed..\n 
response:{response}') + # except ValueError as ve: + # log.critical(f'ValueError: {ve}') + # pass # dist.barrier() - model.train() + # model.train() return model @@ -623,4 +630,4 @@ def main(): print(f"wandb.run.name: {wandb.run.name}") print(f"wandb.run.url: {wandb.run.url}") wandb.finish() - sys.exit() + sys.exit(0) From 005272b586b08e88619b4c7e341bf7951ae563d3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 20:10:25 -0500 Subject: [PATCH 212/268] Update `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index fe903e04f7..7ed38614a7 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -8,7 +8,7 @@ import math from functools import partial from megatron import get_args -from megatron import log.info +# from megatron import print_rank_0 from megatron import get_timers from megatron import get_tokenizer from megatron.core import mpu, tensor_parallel @@ -19,12 +19,12 @@ from megatron.utils import get_ltor_masks_and_position_ids from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb from megatron.arguments import core_transformer_config_from_args -from megatron.utils import ( - report_memory, - throughput_calculator, - checkpoint_throughput_calculator -) -from pathlib import Path +# from megatron.utils import ( +# # report_memory, +# # throughput_calculator, +# # checkpoint_throughput_calculator +# ) +# from pathlib import Path from enrich import get_logger import deepspeed @@ -33,7 +33,7 @@ import subprocess import wandb -import time +# import time from torch import nn import torch.nn.functional as F import ezpz as ez From fdb17075f0a84f0ca61bb76e6bb4d3d37087b649 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 20:58:38 -0500 Subject: [PATCH 213/268] Remove `ds_report` from `train_llama_alcf.sh` --- train_llama_alcf.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 4aac1153c7..cd2d8213dd 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -132,7 +132,7 @@ run_cmd=" # ds_exec # echo "! Using $(which deepspeed)" -ds_report +# ds_report echo "${run_cmd}" From 936c423355d36102b51c9f885079952e15a856b8 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 21:17:41 -0500 Subject: [PATCH 214/268] Update `.gitignore` --- .gitignore | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.gitignore b/.gitignore index 3e46cef4c5..edbde3c246 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,33 @@ +# User Added +*tmp* +*core.* +*old* +*.bak +**index-cache** +**pbslogs** +ezpz +*.o17* +*.e17* +*hostfile* +.deepspeed_env +*.DS_Store +old/* +**venv** +*.json +*.o1 +*.e1 +outputs/ +venvs/ +wandb/ +llama-logs/ +checkpoints/ +*.gz +*.txt +*.idx +*.bin +*.log +__pycache__ + .deepspeed_env *.bak .cache/* From a59a532b332b788d26919d94c6470a8593314775 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 24 Apr 2024 21:39:34 -0500 Subject: [PATCH 215/268] Update `ALCF/test_{sunspot,sirius}.sh` --- ALCF/test_sirius.sh | 8 ++------ ALCF/test_sunspot.sh | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh index 19cb9405dc..108649612c 100755 --- a/ALCF/test_sirius.sh +++ b/ALCF/test_sirius.sh @@ -37,13 +37,11 @@ setup_megatron_deepspeed() { echo "Running test in: ${OUTDIR}" echo "WORKING DIRECTORY: $(realpath $(pwd .))" if [[ -d "Megatron-DeepSpeed" ]]; then - # rm -rfv Megatron-DeepSpeed/ - echo "Found existing Megatron-DeepSpeed. - Remove existing directory to run test." + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." 
exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed - git checkout alcf-tests } @@ -53,8 +51,6 @@ main() { export DEBUG=1 export PBS_O_WORKDIR="$(pwd)" export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt - # LR=0.0008 - # GRAD_ACC_STEPS=8 export ZERO_STAGE=1 export NUM_LAYERS=10 export MICRO_BATCH=8 diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh index ca3a59cc87..67f6868d43 100755 --- a/ALCF/test_sunspot.sh +++ b/ALCF/test_sunspot.sh @@ -39,12 +39,11 @@ setup_megatron_deepspeed() { echo "Running test in: ${OUTDIR}" echo "WORKING DIRECTORY: $(realpath $(pwd .))" if [[ -d "Megatron-DeepSpeed" ]]; then - echo "Found existing Megatron-DeepSpeed. - Remove existing directory to run test." + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed - git checkout alcf-tests } From c9c87d9ff40d013e93dd623ed68cada4086cce9f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 10:53:29 -0500 Subject: [PATCH 216/268] Update `ALCF/data-lists/polaris/*.txt` --- ALCF/data-lists/polaris/algebraic.txt | 16 + ALCF/data-lists/polaris/arxiv.txt | 100 + ALCF/data-lists/polaris/books.txt | 3 + ALCF/data-lists/polaris/c4.txt | 171 + ALCF/data-lists/polaris/cc.txt | 1108 ++++ .../polaris/data_file_list_books.txt | 3 - ALCF/data-lists/polaris/data_file_list_c4.txt | 86 - ALCF/data-lists/polaris/data_file_list_cc.txt | 2878 ----------- .../polaris/data_file_list_peS2o.txt | 42 - .../polaris/data_file_list_stack.txt | 4435 ----------------- .../polaris/data_file_list_wiki.txt | 2 - .../polaris/dolma_v1_7_file_list.txt | 2419 +++++++++ ALCF/data-lists/polaris/falcon.txt | 501 ++ ALCF/data-lists/polaris/megawiki.txt | 262 + .../polaris/open-web-math-train.txt | 13 + ALCF/data-lists/polaris/pes2o.txt | 26 + ALCF/data-lists/polaris/reddit.txt | 78 + 
ALCF/data-lists/polaris/stack.txt | 26 + ALCF/data-lists/polaris/starcoder.txt | 50 + ALCF/data-lists/polaris/tulu.txt | 66 + ALCF/data-lists/polaris/wiki.txt | 2 + 21 files changed, 4841 insertions(+), 7446 deletions(-) create mode 100644 ALCF/data-lists/polaris/algebraic.txt create mode 100644 ALCF/data-lists/polaris/arxiv.txt create mode 100644 ALCF/data-lists/polaris/books.txt create mode 100644 ALCF/data-lists/polaris/c4.txt create mode 100644 ALCF/data-lists/polaris/cc.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_books.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_c4.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_cc.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_peS2o.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_stack.txt delete mode 100644 ALCF/data-lists/polaris/data_file_list_wiki.txt create mode 100644 ALCF/data-lists/polaris/dolma_v1_7_file_list.txt create mode 100644 ALCF/data-lists/polaris/falcon.txt create mode 100644 ALCF/data-lists/polaris/megawiki.txt create mode 100644 ALCF/data-lists/polaris/open-web-math-train.txt create mode 100644 ALCF/data-lists/polaris/pes2o.txt create mode 100644 ALCF/data-lists/polaris/reddit.txt create mode 100644 ALCF/data-lists/polaris/stack.txt create mode 100644 ALCF/data-lists/polaris/starcoder.txt create mode 100644 ALCF/data-lists/polaris/tulu.txt create mode 100644 ALCF/data-lists/polaris/wiki.txt diff --git a/ALCF/data-lists/polaris/algebraic.txt b/ALCF/data-lists/polaris/algebraic.txt new file mode 100644 index 0000000000..505276d3bf --- /dev/null +++ b/ALCF/data-lists/polaris/algebraic.txt @@ -0,0 +1,16 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document diff --git a/ALCF/data-lists/polaris/arxiv.txt b/ALCF/data-lists/polaris/arxiv.txt new file mode 100644 index 0000000000..cae6e2da69 --- /dev/null +++ b/ALCF/data-lists/polaris/arxiv.txt @@ -0,0 +1,100 @@ +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document diff --git a/ALCF/data-lists/polaris/books.txt b/ALCF/data-lists/polaris/books.txt new file mode 100644 index 0000000000..195aca5339 --- /dev/null +++ b/ALCF/data-lists/polaris/books.txt @@ -0,0 +1,3 @@ +0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document 
+0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document diff --git a/ALCF/data-lists/polaris/c4.txt b/ALCF/data-lists/polaris/c4.txt new file mode 100644 index 0000000000..833b095882 --- /dev/null +++ b/ALCF/data-lists/polaris/c4.txt @@ -0,0 +1,171 @@ +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document diff --git a/ALCF/data-lists/polaris/cc.txt b/ALCF/data-lists/polaris/cc.txt new file mode 100644 index 0000000000..edf6aab8c1 --- /dev/null +++ b/ALCF/data-lists/polaris/cc.txt @@ -0,0 +1,1108 @@ +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document + diff --git a/ALCF/data-lists/polaris/data_file_list_books.txt b/ALCF/data-lists/polaris/data_file_list_books.txt deleted file mode 100644 index 18109f946d..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_books.txt +++ /dev/null @@ -1,3 +0,0 @@ -0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0001_text_document -0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0000_text_document -0.006 /eagle/datasets//dolma/data_Llama2Tokenizer/gutenberg-books//books-0002_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_c4.txt b/ALCF/data-lists/polaris/data_file_list_c4.txt deleted file mode 100644 index 
9ff6f90ff9..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_c4.txt +++ /dev/null @@ -1,86 +0,0 @@ -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0012_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0001_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0073_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0045_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0084_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0065_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0032_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0085_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0064_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0025_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0042_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0055_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0023_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0028_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0036_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0037_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0016_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0000_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0019_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0046_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0059_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0017_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0072_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0033_text_document -0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0006_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0061_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0071_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0057_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0011_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0047_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0004_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0009_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0070_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0018_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0054_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0049_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0003_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0021_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0051_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0074_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0027_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0050_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0079_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0022_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0030_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0034_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0020_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0035_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0015_text_document -0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0066_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0044_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0010_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0002_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0041_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0067_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0048_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0013_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0083_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0053_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0008_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0014_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0069_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0056_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0062_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0031_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0007_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0077_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0058_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0076_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0078_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0005_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0081_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0040_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0068_text_document -0.0875 
/eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0075_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0063_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0029_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0039_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0026_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0052_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0024_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0043_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0060_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0082_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0080_text_document -0.0875 /eagle/datasets//dolma/data_Llama2Tokenizer/c4//c4-0038_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_cc.txt b/ALCF/data-lists/polaris/data_file_list_cc.txt deleted file mode 100644 index 4a20a99b89..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_cc.txt +++ /dev/null @@ -1,2878 +0,0 @@ -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0553_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0299_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0366_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0753_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0429_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0372_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0124_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0437_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0053_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0615_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0182_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0713_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0688_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0166_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0768_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0692_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0041_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0416_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0630_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0639_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0225_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0035_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0365_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0368_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0196_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0328_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0624_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0081_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0488_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0189_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0118_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0150_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0314_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0209_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0229_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0265_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0532_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0478_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0140_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0256_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0047_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0607_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0023_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0111_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0613_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0748_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0000_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0127_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0106_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0563_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0577_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0502_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0705_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0538_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0088_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0263_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0460_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0571_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0653_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0172_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0524_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0652_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0322_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0447_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0387_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0612_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0290_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0339_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0487_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0396_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0178_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0091_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0193_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0408_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0496_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0755_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0773_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0547_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0384_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0574_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0533_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0464_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0489_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0050_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0060_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0114_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0033_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0561_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0208_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0233_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0744_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0326_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0313_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0482_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0436_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0588_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0080_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0660_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0038_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0282_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0745_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0406_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0116_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0059_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0503_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0357_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0171_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0770_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0286_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0544_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0698_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0155_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0341_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0463_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0051_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0676_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0595_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0174_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0198_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0480_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0687_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0145_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0004_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0583_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0449_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0204_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0715_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0521_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0320_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0568_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0151_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0197_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0709_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0499_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0006_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0269_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0525_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0413_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0656_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0646_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0246_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0535_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0333_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0238_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0241_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0469_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0689_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0403_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0404_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0360_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0191_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0236_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0032_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0445_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0614_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0490_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0651_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0703_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0702_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0623_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0719_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0728_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0031_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0253_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0551_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0327_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0027_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0491_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0395_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0473_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0662_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0312_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0605_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0455_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0580_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0005_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0311_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0305_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0260_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0566_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0670_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0129_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0742_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0549_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0058_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0501_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0071_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0450_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0375_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0131_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0697_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0415_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0560_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0643_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0699_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0515_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0739_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0092_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0046_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0083_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0443_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0746_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0655_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0427_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0603_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0367_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0318_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0520_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0749_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0771_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0369_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0434_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0602_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0349_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0763_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0731_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0338_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0462_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0347_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0649_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0194_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0134_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0734_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0632_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0280_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0184_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0089_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0095_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0555_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0016_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0168_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0665_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0767_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0666_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0737_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0037_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0648_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0064_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0764_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0323_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0009_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0545_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0212_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0015_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0267_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0727_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0661_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0211_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0220_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0278_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0721_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0718_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0207_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0619_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0400_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0754_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0610_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0358_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0758_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0298_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0756_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0729_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0468_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0397_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0247_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0149_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0119_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0010_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0093_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0386_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0045_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0066_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0393_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0600_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0440_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0350_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0214_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0714_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0161_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0775_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0203_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0077_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0332_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0700_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0123_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0024_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0013_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0587_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0148_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0513_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0674_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0188_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0599_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0158_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0425_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0003_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0534_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0254_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0121_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0099_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0373_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0479_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0379_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0344_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0684_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0720_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0391_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0575_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0319_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0336_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0531_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0474_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0432_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0766_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0342_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0476_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0237_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0061_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0250_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0752_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0329_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0376_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0640_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0634_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0682_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0181_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0076_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0244_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0690_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0303_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0228_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0477_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0224_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0199_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0343_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0399_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0707_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0760_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0774_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0270_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0144_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0451_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0180_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0025_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0363_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0516_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0647_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0581_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0679_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0635_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0201_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0133_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0351_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0325_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0183_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0287_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0683_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0316_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0275_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0424_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0461_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0576_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0390_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0052_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0086_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0492_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0216_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0772_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0439_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0249_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0493_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0593_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0442_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0218_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0484_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0346_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0157_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0352_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0441_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0486_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0537_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0485_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0164_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0022_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0458_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0497_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0170_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0154_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0751_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0048_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0428_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0418_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0112_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0757_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0421_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0471_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0510_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0466_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0641_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0601_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0740_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0594_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0276_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0383_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0232_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0717_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0644_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0518_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0743_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0673_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0044_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0667_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0308_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0675_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0572_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0579_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0723_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0381_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0759_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0504_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0708_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0049_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0642_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0074_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0039_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0401_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0409_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0014_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0098_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0146_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0616_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0101_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0446_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0565_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0295_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0730_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0498_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0638_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0301_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0139_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0192_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0001_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0268_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0527_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0359_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0315_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0251_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0546_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0262_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0659_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0567_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0190_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0078_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0175_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0054_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0008_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0452_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0187_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0011_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0138_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0087_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0206_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0611_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0509_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0205_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0620_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0677_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0132_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0296_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0495_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0444_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0598_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0691_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0761_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0417_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0317_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0578_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0374_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0055_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0481_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0307_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0736_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0136_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0550_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0084_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0511_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0380_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0356_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0310_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0110_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0668_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0306_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0115_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0324_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0202_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0294_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0704_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0629_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0608_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0627_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0725_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0472_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0230_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0407_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0556_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0505_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0040_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0606_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0096_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0281_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0179_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0557_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0288_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0769_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0370_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0017_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0694_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0385_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0130_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0562_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0506_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0036_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0217_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0289_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0712_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0724_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0564_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0105_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0120_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0141_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0431_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0142_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0570_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0512_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0227_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0411_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0389_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0735_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0585_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0122_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0042_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0309_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0765_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0636_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0539_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0467_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0586_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0750_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0200_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0671_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0530_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0012_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0082_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0160_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0438_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0195_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0185_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0215_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0173_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0710_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0348_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0590_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0073_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0030_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0079_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0072_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0019_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0239_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0410_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0453_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0543_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0007_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0733_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0125_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0569_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0331_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0062_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0043_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0433_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0235_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0448_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0696_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0559_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0392_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0664_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0483_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0591_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0272_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0271_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0340_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0159_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0153_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0541_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0028_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0067_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0222_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0165_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0117_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0669_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0103_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0258_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0097_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0419_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0459_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0609_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0104_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0430_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0582_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0457_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0529_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0029_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0426_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0279_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0596_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0631_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0517_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0507_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0252_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0626_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0056_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0335_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0542_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0672_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0284_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0257_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0654_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0678_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0018_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0528_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0422_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0068_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0245_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0617_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0255_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0176_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0732_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0221_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0371_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0137_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0177_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0382_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0291_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0597_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0321_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0292_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0273_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0716_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0094_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0026_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0021_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0242_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0226_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0100_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0519_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0552_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0057_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0776_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0109_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0219_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0706_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0508_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0722_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0456_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0633_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0618_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0354_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0514_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0475_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0034_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0300_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0454_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0693_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0658_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0304_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0090_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0548_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0063_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0465_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0414_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0169_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0762_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0701_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0135_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0143_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0167_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0523_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0540_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0334_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0297_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0738_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0500_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0554_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0695_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0747_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0108_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0113_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0686_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0302_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0362_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0355_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0645_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0102_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0681_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0536_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0156_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0163_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0277_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0573_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0293_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0741_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0377_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0423_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0592_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0657_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0584_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0628_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0240_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0361_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0680_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0388_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0435_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0345_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0637_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0420_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0378_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0398_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0234_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0266_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0210_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0711_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0162_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0264_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0070_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0223_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0470_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0002_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0069_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0625_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0285_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0065_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0147_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0085_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0394_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0330_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0621_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0248_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0274_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0526_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0685_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0589_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0128_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0405_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0126_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0075_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0494_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0283_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0558_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0402_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0261_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0107_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0522_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0604_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0231_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0663_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0726_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0152_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0353_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0243_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0364_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0213_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0412_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0622_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_middle/cc_en_middle-0650_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0354_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0200_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0391_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0559_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0473_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0528_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0073_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0170_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0180_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0582_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0067_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0036_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0023_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0099_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0342_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0353_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0044_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0575_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0107_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0566_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0101_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0542_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0217_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0437_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0583_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0172_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0060_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0072_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0393_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0414_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0355_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0082_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0281_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0051_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0133_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0471_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0346_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0034_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0300_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0282_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0309_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0125_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0434_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0460_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0186_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0504_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0487_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0132_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0225_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0401_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0477_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0607_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0562_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0497_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0174_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0569_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0591_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0110_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0455_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0166_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0453_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0581_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0341_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0544_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0606_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0291_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0086_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0578_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0083_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0357_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0188_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0411_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0218_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0003_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0001_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0543_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0307_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0169_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0149_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0561_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0310_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0222_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0547_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0183_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0214_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0111_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0127_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0054_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0007_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0572_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0103_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0334_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0114_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0513_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0237_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0397_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0306_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0120_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0129_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0262_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0459_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0085_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0206_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0271_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0610_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0031_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0390_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0043_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0012_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0409_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0574_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0596_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0588_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0532_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0236_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0501_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0269_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0540_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0175_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0290_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0233_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0440_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0071_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0037_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0420_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0404_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0141_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0592_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0164_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0162_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0388_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0159_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0372_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0476_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0323_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0008_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0151_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0533_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0344_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0481_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0204_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0179_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0496_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0469_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0055_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0367_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0277_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0603_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0512_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0340_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0143_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0140_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0285_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0124_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0531_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0375_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0013_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0522_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0066_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0332_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0228_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0445_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0430_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0018_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0392_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0505_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0485_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0130_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0026_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0489_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0006_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0157_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0467_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0454_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0600_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0163_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0248_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0339_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0534_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0038_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0597_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0303_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0425_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0000_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0352_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0226_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0461_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0545_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0102_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0587_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0048_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0336_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0502_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0427_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0090_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0369_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0216_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0292_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0243_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0326_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0602_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0611_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0499_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0032_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0599_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0097_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0182_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0378_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0509_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0139_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0456_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0322_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0221_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0153_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0076_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0057_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0284_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0075_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0422_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0288_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0077_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0305_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0273_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0242_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0050_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0525_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0232_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0173_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0294_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0016_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0365_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0604_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0138_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0178_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0377_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0042_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0065_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0112_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0142_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0184_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0495_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0276_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0301_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0210_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0494_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0447_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0333_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0424_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0224_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0105_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0081_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0579_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0536_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0168_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0293_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0021_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0230_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0260_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0450_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0465_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0394_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0319_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0028_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0608_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0538_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0155_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0517_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0240_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0515_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0158_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0209_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0321_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0296_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0576_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0080_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0091_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0259_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0121_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0580_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0595_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0062_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0436_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0337_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0059_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0115_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0412_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0462_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0106_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0423_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0366_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0215_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0263_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0295_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0443_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0557_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0010_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0136_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0376_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0235_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0135_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0144_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0548_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0537_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0128_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0287_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0194_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0069_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0324_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0364_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0187_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0096_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0558_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0063_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0551_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0286_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0449_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0255_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0358_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0383_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0283_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0470_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0403_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0349_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0524_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0426_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0486_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0519_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0380_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0387_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0122_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0518_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0554_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0027_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0418_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0104_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0039_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0268_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0201_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0094_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0347_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0416_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0514_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0231_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0330_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0523_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0570_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0421_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0009_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0478_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0417_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0482_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0213_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0568_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0312_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0550_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0219_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0093_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0530_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0035_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0356_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0406_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0498_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0297_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0266_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0428_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0074_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0244_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0431_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0410_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0134_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0246_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0408_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0563_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0360_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0468_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0402_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0370_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0148_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0092_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0089_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0251_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0061_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0609_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0539_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0475_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0261_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0304_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0203_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0084_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0395_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0131_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0197_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0087_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0327_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0279_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0196_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0555_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0041_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0317_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0193_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0441_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0202_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0511_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0220_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0474_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0577_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0014_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0171_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0361_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0264_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0024_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0432_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0451_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0552_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0584_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0239_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0458_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0407_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0045_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0541_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0371_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0480_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0590_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0015_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0108_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0320_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0145_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0483_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0521_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0419_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0150_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0526_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0589_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0119_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0315_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0546_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0510_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0373_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0413_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0249_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0484_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0493_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0491_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0385_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0005_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0374_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0185_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0345_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0571_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0167_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0234_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0318_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0520_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0256_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0116_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0088_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0556_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0302_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0238_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0205_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0019_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0191_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0199_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0078_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0594_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0195_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0030_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0439_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0448_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0350_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0267_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0275_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0348_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0560_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0181_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0329_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0516_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0363_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0258_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0359_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0299_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0457_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0379_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0049_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0368_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0265_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0046_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0311_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0177_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0058_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0040_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0549_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0605_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0160_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0472_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0020_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0553_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0211_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0052_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0466_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0382_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0351_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0433_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0270_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0593_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0529_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0095_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0308_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0152_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0064_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0189_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0527_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0070_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0400_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0257_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0229_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0154_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0362_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0573_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0161_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0252_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0386_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0280_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0585_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0464_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0338_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0278_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0506_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0033_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0137_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0444_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0247_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0109_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0004_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0011_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0508_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0126_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0017_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0254_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0567_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0100_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0398_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0117_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0147_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0176_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0156_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0490_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0022_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0190_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0047_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0207_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0446_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0227_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0435_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0289_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0146_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0598_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0503_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0165_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0208_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0025_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0442_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0325_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0429_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0056_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0212_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0002_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0245_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0068_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0381_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0452_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0500_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0396_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0328_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0384_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0389_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0586_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0488_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0298_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0463_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0118_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0479_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0274_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0272_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0492_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0113_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0415_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0405_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0198_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0313_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0331_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0314_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0053_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0507_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0438_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0029_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0399_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0564_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0601_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0241_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0343_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0223_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0316_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0123_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0535_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0250_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0335_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0253_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0079_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0565_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0192_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_head/cc_en_head-0098_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0247_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1166_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1192_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0818_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0166_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0529_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1356_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0858_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0823_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1339_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0627_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1370_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0341_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0185_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1259_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0981_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0515_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0545_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1167_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0077_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0665_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1195_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0085_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0566_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0230_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0311_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0055_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0810_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1270_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0966_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0517_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0843_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0348_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0797_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0736_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0943_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1054_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1105_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0556_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0849_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0492_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1121_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0817_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0874_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0579_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1250_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0146_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0589_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0169_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1084_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1041_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0526_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0551_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0193_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1402_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1106_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0782_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0659_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0588_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0990_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0833_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0845_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1098_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0402_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0878_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0930_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0046_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0440_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1293_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0393_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0049_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0305_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0868_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1126_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0531_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1087_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1442_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0997_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0366_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0165_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1078_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0957_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1002_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0269_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0460_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1397_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0250_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0951_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1246_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0876_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0302_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0564_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0584_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0622_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0694_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0335_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1189_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0215_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1390_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0204_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1038_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0713_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0567_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1130_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0221_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0538_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1232_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1265_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0628_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0090_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0968_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0248_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0885_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0977_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0749_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0527_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0985_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0934_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0993_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0088_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0674_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0171_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1160_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0640_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1419_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0488_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0704_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0887_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1055_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1258_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0924_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0390_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0612_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1276_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0744_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0399_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0053_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0025_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0371_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0161_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1444_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0051_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0367_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0036_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1398_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0434_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0965_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1226_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0093_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0702_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0949_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1343_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1480_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0503_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0979_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1039_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1034_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0725_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1185_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1288_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0742_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0242_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1071_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0975_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1211_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0435_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0841_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1303_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1380_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0558_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0522_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0181_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0098_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0318_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1101_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1183_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0054_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0962_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0072_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1283_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1269_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0379_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0109_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1257_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0683_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0932_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0724_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0259_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0752_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0748_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1456_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0038_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1311_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0224_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1181_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0292_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0006_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1077_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1376_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0082_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1107_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1305_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1263_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1484_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0470_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0232_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0312_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1478_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0176_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0825_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0587_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0726_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1069_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0478_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1470_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0751_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0034_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0557_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0087_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0554_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0680_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0095_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1193_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1290_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0956_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1433_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1409_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0546_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1137_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1042_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1361_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0983_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1099_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0675_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0000_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0540_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0593_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1182_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0984_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0463_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0406_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0963_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1112_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0572_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0894_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0901_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0539_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1462_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0504_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1196_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0916_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0840_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1059_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0895_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1439_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0521_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0227_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0157_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0134_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0091_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0080_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1405_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0856_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0355_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0904_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1256_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0886_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0703_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0870_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1307_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1463_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1278_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1176_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0999_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0352_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1216_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0506_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0942_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0164_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0071_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0362_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0285_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0819_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1447_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0794_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1111_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0452_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1008_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1064_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0786_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0275_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1004_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1020_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1198_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0258_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0179_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0473_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0706_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0829_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1348_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0489_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0721_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1418_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0657_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0182_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1268_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0677_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0211_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1199_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1161_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1159_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1060_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0203_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0634_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0214_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1292_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0168_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1342_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0105_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1346_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0145_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0174_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1393_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1295_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0888_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0673_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1412_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1337_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1089_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1031_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0542_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0135_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0604_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0905_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1028_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0760_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0052_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0361_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0408_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0514_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1144_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0086_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0016_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0363_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0026_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0013_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0048_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0343_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0991_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1151_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0325_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0209_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1325_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0042_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0162_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0902_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0625_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0482_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0502_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0225_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1190_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0498_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0198_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1146_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1197_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0889_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0873_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1013_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1212_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0107_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0005_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0528_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0101_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0792_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0638_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1253_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1449_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0761_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1330_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0268_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0372_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0757_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0031_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0815_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0074_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0771_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0936_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1355_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0764_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0516_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1460_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0394_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0178_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1005_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1103_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0899_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1204_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1061_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0327_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0233_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0057_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0922_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0996_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1457_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0513_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1323_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0426_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1428_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0879_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0297_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1395_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0333_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0202_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1171_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0746_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0111_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1459_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0806_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1332_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0003_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1082_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0745_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1187_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0803_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0326_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1131_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0678_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1458_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1465_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0662_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0715_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0043_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1029_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0108_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0450_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0465_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0125_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1052_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0988_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0487_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0872_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0096_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0865_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0643_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0507_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0032_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1228_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1076_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1229_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0219_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1341_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0384_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0583_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0236_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1267_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0682_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1324_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0672_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1237_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0256_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0689_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0801_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1392_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0630_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0041_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1027_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0173_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1476_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1455_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0172_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0228_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0812_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0980_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0781_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1280_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0141_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1046_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0656_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0137_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0234_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0909_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0410_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0836_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1313_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1326_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1113_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1040_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1345_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1492_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0831_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0293_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1063_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0309_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0353_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0597_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0011_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0189_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1174_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0474_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1079_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0066_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0697_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0734_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0756_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1488_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0238_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0235_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0274_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0417_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0316_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1451_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0864_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0127_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0900_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0244_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0291_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1205_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1344_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1351_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0441_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0213_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1143_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1320_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0437_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0927_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0324_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1125_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1421_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0735_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0573_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0002_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1416_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0776_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1430_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1322_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0180_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0272_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0896_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0543_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0570_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0859_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0255_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0263_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1299_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0835_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0039_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0611_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0369_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0732_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1485_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0045_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1138_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0067_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0642_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0961_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0249_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1221_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0496_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1312_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0599_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0497_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1413_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1382_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0661_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1251_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0560_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0350_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0018_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0354_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0007_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0594_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1003_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1375_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0298_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0199_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0555_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0419_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0400_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0608_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0789_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0618_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0212_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0493_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0633_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0920_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0621_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0129_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1033_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0982_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0300_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1000_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1284_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0241_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0163_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0971_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0455_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1124_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1464_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0332_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1225_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0160_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0132_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0953_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1396_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0319_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0014_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0867_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0765_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0731_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0121_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1340_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1414_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1149_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0897_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1140_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1406_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1491_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0811_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0047_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0907_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0184_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1202_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0595_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1333_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0149_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1234_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1289_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0331_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1415_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0769_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0935_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0216_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0940_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0762_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0445_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0378_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0280_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0376_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1047_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1045_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1056_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0356_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0210_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1386_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1110_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0476_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1163_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0598_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0511_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0279_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1254_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0115_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0365_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1298_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0839_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1227_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1282_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0030_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0254_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0658_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0978_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0851_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0130_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0357_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0152_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0952_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0834_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1436_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1302_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1210_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1445_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1328_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0188_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1152_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0340_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0534_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0986_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0892_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0062_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1173_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0009_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0537_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0058_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0286_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1350_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0837_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0068_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1291_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1108_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0158_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0425_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1007_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0717_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1314_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1097_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0131_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1425_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0050_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1432_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0257_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1487_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0793_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0655_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0339_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1109_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0151_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0830_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0912_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0700_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0959_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1156_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0798_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1371_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0023_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1483_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1357_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0122_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0462_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1021_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0562_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0505_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0787_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1420_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1399_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1024_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1296_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0454_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0844_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0133_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1377_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1037_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0705_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0623_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0950_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0547_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0240_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1011_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0893_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0117_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1178_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1120_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0346_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1354_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0880_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1019_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0477_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1231_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0660_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0471_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1168_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1214_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1378_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1142_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1102_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1015_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1438_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1372_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0945_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1248_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1119_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0064_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0973_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0509_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1135_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0019_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0663_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1025_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0261_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1431_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1403_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0004_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1374_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1164_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0740_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1217_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0387_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0911_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1203_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0995_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0535_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1319_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0795_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1184_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0915_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0576_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0329_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0929_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0142_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0413_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0520_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1080_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1334_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1207_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0869_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1012_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0475_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0167_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0494_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0264_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1358_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0641_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1318_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1194_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0059_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0578_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1095_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0692_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1422_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0195_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0431_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0928_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0388_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0690_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0467_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0415_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0119_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0187_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0424_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0716_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0854_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0252_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0635_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0755_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0294_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0719_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0785_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0914_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1364_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0021_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1242_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0220_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0139_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1220_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1383_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0102_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1424_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0921_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1215_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0722_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0908_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0805_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0414_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0190_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0089_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1150_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1360_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0453_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0827_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1273_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0670_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0891_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1329_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1261_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1147_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1262_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0359_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0328_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1240_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1391_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0156_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0989_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1219_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0923_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0654_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0698_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0065_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1287_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0383_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0532_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0260_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1349_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1086_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0788_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0113_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1363_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0687_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1083_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0918_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0585_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0392_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0253_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1043_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0449_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0568_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0421_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0747_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1175_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1373_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1304_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1474_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1482_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1385_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0317_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0603_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0519_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1090_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0020_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1489_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0871_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0510_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0104_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1018_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0245_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1440_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0758_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1247_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1010_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1179_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0552_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0276_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0679_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0571_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1017_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0301_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1477_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1450_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0712_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0676_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0577_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0684_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0881_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0644_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0076_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1446_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0154_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0284_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0824_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0231_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0701_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0790_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1224_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1454_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0405_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0177_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0267_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0944_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1368_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0389_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0610_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0128_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1085_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1057_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0459_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0027_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0759_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0458_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0472_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0816_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0022_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0832_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0097_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0407_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0820_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0777_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0941_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0821_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0796_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1022_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0967_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0548_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1117_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0197_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0175_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0218_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0397_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0850_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0710_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0592_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0447_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0607_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1249_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0436_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0784_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0728_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0646_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1104_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0443_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1154_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0194_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1206_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1233_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0170_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0299_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0605_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0033_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0037_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0631_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0877_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0565_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0626_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0723_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0650_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0590_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0938_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1132_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1479_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0313_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0651_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0926_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1274_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0828_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0602_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0637_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0423_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0681_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0411_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0223_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1186_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0591_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1441_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0533_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1366_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1466_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1423_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1075_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0939_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1327_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0647_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0420_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0330_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1016_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0600_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0606_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1096_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0246_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0530_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0344_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1471_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0774_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0733_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0283_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0561_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0466_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1032_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1188_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1285_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0029_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1081_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0063_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0739_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0954_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0448_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1331_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1218_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0688_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1169_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0799_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0866_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0853_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0955_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0910_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0490_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0861_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0948_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1336_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0349_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0808_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1277_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0601_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0044_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0315_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1230_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1255_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0271_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1051_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0481_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0970_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0412_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0040_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1068_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1091_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1394_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0862_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1461_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1453_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0070_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0852_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0012_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0484_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0140_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0919_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0288_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1490_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0863_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1129_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0737_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0791_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0884_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0708_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0667_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1452_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1048_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0609_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1049_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0336_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0800_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0196_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0931_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1072_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0207_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0144_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0395_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0648_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1389_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0582_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0433_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0553_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1088_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0391_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0304_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0110_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1310_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0398_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0446_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0574_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0772_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0686_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1335_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1417_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1388_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0855_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0457_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1093_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0903_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0669_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0860_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0754_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0289_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0946_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1275_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0024_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0310_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1153_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0958_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0385_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1115_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0550_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0653_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1243_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1030_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0775_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0382_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1369_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1429_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1400_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1213_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0709_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0842_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0691_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1066_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0766_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1139_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0499_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0699_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0200_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0239_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0439_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1155_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0525_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0523_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0079_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0416_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1481_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0727_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0307_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0138_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1427_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1475_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0429_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0206_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0375_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1448_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1352_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0270_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0103_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0778_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1408_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1223_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1472_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0813_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1384_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0629_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0337_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0848_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0124_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0753_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1410_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0243_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0714_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1001_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1317_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0112_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1260_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0251_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1272_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0624_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1158_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0444_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0925_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1050_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1134_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0695_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0508_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0615_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0265_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1316_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1148_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1315_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0226_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0320_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0237_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1353_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0287_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0262_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1067_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0430_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0001_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0360_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0017_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1468_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0501_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1141_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0998_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1145_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1473_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0396_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0913_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0380_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0544_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1411_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0106_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0442_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0418_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0491_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0368_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0296_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0483_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0536_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1306_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1347_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1467_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0969_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1036_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0010_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1044_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0619_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1437_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0282_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1162_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0750_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0126_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0192_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1362_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1435_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1359_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0685_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1469_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1338_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0273_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0060_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0229_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0358_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0743_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1294_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1136_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0155_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1122_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0707_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1301_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0814_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0083_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0303_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1208_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1116_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0512_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0205_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0847_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0464_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1365_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1266_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0278_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0075_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0201_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1308_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1235_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0056_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0061_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0917_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0401_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0617_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1236_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0974_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0693_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1245_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0524_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0541_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0351_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0099_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0614_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0403_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0321_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0370_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1379_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0084_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0666_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0783_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0028_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0208_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1244_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1300_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0883_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1239_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0308_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0035_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0486_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1271_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0078_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1127_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0377_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0652_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1443_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0711_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0838_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0322_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1222_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0290_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0857_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1062_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0186_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1114_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1073_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0802_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1123_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0281_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0770_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0620_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0581_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0456_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0094_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0092_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0645_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1426_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0334_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0809_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0422_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1170_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0153_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0147_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0100_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0636_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1026_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0639_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0081_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0720_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1094_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0381_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0342_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1387_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0347_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0191_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0409_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1118_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0159_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0485_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1009_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1165_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0116_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0586_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1381_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0668_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0992_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0073_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0664_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1264_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0183_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0822_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0338_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1200_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0479_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0972_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0826_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0780_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0432_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0373_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0767_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0469_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0779_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0217_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1177_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0364_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1014_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0461_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1209_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0976_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0804_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0549_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1180_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0120_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1401_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0559_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0933_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1157_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0114_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0480_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0964_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1023_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0580_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1053_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0266_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0438_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0875_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1286_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0123_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0729_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0937_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0295_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0277_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0069_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1006_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0994_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1100_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0960_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1407_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0741_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0374_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1065_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0730_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0306_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1074_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0222_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0008_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0763_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0563_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1172_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0632_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1092_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0906_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0613_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0890_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1070_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0136_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0947_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1238_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0882_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0495_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1128_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1279_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0575_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1201_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0404_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0773_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0898_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1321_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0846_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0518_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1367_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0345_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0015_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0451_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1297_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1241_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0987_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0150_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0696_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0807_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1035_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1434_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0649_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1404_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1281_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0768_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0738_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1309_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1133_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1058_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0427_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0468_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0500_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0148_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0386_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0616_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0428_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0118_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0569_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0323_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0718_text_document -0.673 
/eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1252_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1191_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0143_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0314_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0671_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-1486_text_document -0.673 /eagle/datasets//dolma/data_Llama2Tokenizer/common-crawl/cc_en_tail/cc_en_tail-0596_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_peS2o.txt b/ALCF/data-lists/polaris/data_file_list_peS2o.txt deleted file mode 100644 index 3f2ddfb299..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_peS2o.txt +++ /dev/null @@ -1,42 +0,0 @@ -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0039_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0014_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0034_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0007_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0020_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0026_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0036_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0030_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0015_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0018_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0033_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0027_text_document -0.057 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0023_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0024_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0009_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0025_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0010_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0032_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0029_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0021_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0040_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0000_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0013_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0005_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0022_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0011_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0038_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0003_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0019_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0031_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0012_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0041_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0004_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0001_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0037_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0006_text_document -0.057 
/eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0016_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0002_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0017_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0028_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0035_text_document -0.057 /eagle/datasets//dolma/data_Llama2Tokenizer/peS2o//s2_v3-0008_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_stack.txt b/ALCF/data-lists/polaris/data_file_list_stack.txt deleted file mode 100644 index f5049cd9e4..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_stack.txt +++ /dev/null @@ -1,4435 +0,0 @@ -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0026_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0038_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0036_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scss/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+django/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0044_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0046_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0065_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0029_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/text/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0008_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elixir/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rhtml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl6/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gap/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/alloy/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/squirrel/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webassembly/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0019_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/common-lisp/v3-0001_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rouge/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/textile/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/labview/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mask/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/batchfile/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/openscad/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/openscad/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/maxscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/modelica/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/modelica/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unrealscript/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pascal/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lookml/v3-0000_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opal/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0047_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0075_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0028_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0010_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0032_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0039_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lua/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/module-management-system/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opa/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ren'py/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ren'py/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0009_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0034_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0074_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0005_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0006_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smalltalk/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/maple/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/numpy/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/realbasic/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pure-data/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pure-data/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ceylon/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0021_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0017_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0037_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0077_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0053_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sql/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0102_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0104_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0068_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0105_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0020_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0106_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0103_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0107_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0065_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0108_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0013_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0101_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html/v3-0031_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/augeas/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0057_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0048_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0079_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0081_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/swift/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rdoc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logtalk/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c2hs-haskell/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pov-ray-sdl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ioke/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/verilog/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clips/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/chuck/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stylus/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pod/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pod/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xojo/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0024_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/makefile/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/f-sharp/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/component-pascal/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/component-pascal/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/powershell/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emacs-lisp/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lex/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lex/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/grace/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/processing/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sas/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sas/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/netlogo/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0072_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0100_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0089_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0022_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c++/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autoit/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autoit/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zephir/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/robotframework/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/propeller-spin/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xquery/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/txl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nu/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/toml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ampl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tea/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/csound/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/brightscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/perl/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slim/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slim/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/red/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/thrift/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pony/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stata/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-coffeescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ats/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot-internal-representation/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lolcode/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/digital-command-language/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/abap/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/abap/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lsl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0005_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nix/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jade/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hlsl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hlsl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scaml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ags-script/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0055_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0022_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tex/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0058_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0060_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0070_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0056_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0018_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yaml/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clojure/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xs/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mtml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rmarkdown/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kit/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mako/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/igor-pro/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sourcepawn/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sourcepawn/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/apl/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/protocol-buffer/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nginx/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0008_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0049_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0018_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scala/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ocaml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/piglatin/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0023_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0035_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0016_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shell/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0003_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/solidity/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/awk/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vcl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gdscript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/applescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/webidl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0006_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/max/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glyph/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/papyrus/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/boo/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hy/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/d/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/aspectj/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/isabelle/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/isabelle/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sqf/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sqf/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/volt/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/monkey/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lfe/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mathematica/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clarion/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/oxygene/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/metal/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nsis/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zig/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/muf/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dylan/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xbase/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smt/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autohotkey/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/autohotkey/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/erlang/v3-0001_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arduino/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/clean/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ston/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/creole/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ecl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/elm/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eiffel/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eiffel/v3-0001_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0013_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turtle/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/darcs-patch/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+php/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cython/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dm/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cmake/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/opencl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot-assembly/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0000_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asciidoc/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cobol/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/io/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/saltstack/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ox/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/matlab/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/renderscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0008_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/less/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purescript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dogescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/omgrofl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/supercollider/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0019_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0067_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0075_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0035_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0081_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/typescript/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/flux/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/viml/v3-0001_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-haskell/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcsh/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0010_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fortran/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+eex/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/golo/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pawn/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0024_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kicad/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/oz/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/idl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/literate-agda/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/click/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/freemarker/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/krl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/inform-7/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smarty/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/stan/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/livescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dockerfile/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion-cfc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion-cfc/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gosu/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0051_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0091_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0055_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0011_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0036_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/kotlin/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/parrot/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lilypond/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/moonscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0071_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0094_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0054_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0040_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0073_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jupyter-notebook/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/idris/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mediawiki/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/logos/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fish/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0005_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/julia/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coffeescript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0006_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/asp/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/systemverilog/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/org/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0093_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0086_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0082_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0063_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0036_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/css/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/actionscript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/myghty/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/arc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0012_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphviz-(dot)/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/latte/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/turing/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/java-server-pages/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bison/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/emberscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xpages/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/linker-script/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/linker-script/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coldfusion/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0050_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0035_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0036_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/eagle/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/graphql/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cycript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scilab/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-eclass/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0071_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0066_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0078_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0029_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/go/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0025_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0067_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0075_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0002_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0006_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy-server-pages/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy-server-pages/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dns-zone/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0006_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/scheme/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/hcl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/netlinx/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/irc-log/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mirah/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0019_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0014_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0012_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0045_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0010_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rust/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0021_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0017_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0037_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0077_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0053_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xml/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haskell/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0014_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/visual-basic/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/purebasic/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/handlebars/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cartocss/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/j/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jflex/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0021_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0094_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0070_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0098_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0043_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unity3d-asset/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/qmake/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/api-blueprint/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0093_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0007_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0074_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0045_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0073_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gettext-catalog/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pan/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/brainfuck/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/edn/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ecere-projects/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fantom/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/tcl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/grammatical-framework/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/befunge/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sass/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0052_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0035_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0005_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/restructuredtext/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objdump/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ada/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0014_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vhdl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/twig/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/urweb/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0021_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0017_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0037_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0077_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0053_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ruby/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/blitzmax/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/liquid/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/liquid/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/genshi/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/g-code/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ninja/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ninja/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gams/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lasso/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/desktop/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0009_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/postscript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/agda/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/agda/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0003_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-c++/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m4/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m4/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/coq/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0096_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0062_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0039_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/php/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-ebuild/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gentoo-ebuild/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/factor/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/uno/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/apacheconf/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pogoscript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nimrod/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jasmin/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0051_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0094_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0082_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0056_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0033_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/javascript/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0010_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/html+erb/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bluespec/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nit/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ec/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/raml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/rebol/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0026_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/diff/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/objective-j/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bro/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sparql/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/chapel/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/pike/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0017_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0042_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ini/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/harbour/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/llvm/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/crystal/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/lean/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/unified-parallel-c/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0050_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0032_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0074_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0040_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0010_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/c-sharp/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xtend/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ooc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shen/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/self/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/m/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/x10/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cirru/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/redcode/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/mupad/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0019_text_document 
-0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0064_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0067_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0075_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0083_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0035_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0098_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0081_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/markdown/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0003_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/glsl/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/inno-setup/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0037_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/smali/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0019_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0011_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groff/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/wisp/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0004_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/haxe/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cucumber/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/http/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/http/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yacc/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/forth/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/yang/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0000_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/r/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gnuplot/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0002_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xslt/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/ragel-in-ruby-host/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nesc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/nesc/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/standard-ml/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/slash/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0058_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0012_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0022_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/dart/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/zimpl/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/sage/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/blitzbasic/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/octave/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/fancy/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/antlr/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bitbake/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/bitbake/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0009_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/gas/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0011_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/web-ontology-language/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0032_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0026_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0005_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/jsx/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vala/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cap'n-proto/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/prolog/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/cuda/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/shellsession/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/python-traceback/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0015_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0016_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/groovy/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0087_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0025_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0019_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0015_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0009_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0064_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0050_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0058_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0093_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0059_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0051_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0021_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0071_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0008_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0044_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0072_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0097_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0068_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0080_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0057_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0047_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0052_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0067_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0014_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0023_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0034_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0032_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0007_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0060_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0086_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0094_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0091_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0017_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0090_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0066_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0046_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0096_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0100_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0020_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0076_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0092_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0048_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0049_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0075_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0026_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0012_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0095_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0074_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0083_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0054_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0070_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0082_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0055_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0024_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0037_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0088_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0078_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0065_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0062_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0089_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0099_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0061_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0041_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0079_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0035_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0002_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0028_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0045_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0040_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0004_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0098_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0038_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0056_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0063_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0011_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0003_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0077_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0042_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0013_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0029_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0039_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0022_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0084_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0085_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0069_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0081_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0006_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0073_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0027_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0010_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0043_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0005_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0016_text_document -0.172 
/eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0033_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0018_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0036_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0030_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0001_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0053_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/vue/v3-0031_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/xproc/v3-0000_text_document -0.172 /eagle/datasets//dolma/data_Llama2Tokenizer/stack-code/racket/v3-0000_text_document diff --git a/ALCF/data-lists/polaris/data_file_list_wiki.txt b/ALCF/data-lists/polaris/data_file_list_wiki.txt deleted file mode 100644 index 134c1473b1..0000000000 --- a/ALCF/data-lists/polaris/data_file_list_wiki.txt +++ /dev/null @@ -1,2 +0,0 @@ -0.0045 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0000_text_document -0.0045 /eagle/datasets//dolma/data_Llama2Tokenizer/wiki-en-simple//en_simple_wiki-0001_text_document diff --git a/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..6f34558ec3 --- /dev/null +++ b/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document 
+0.0003083776568189157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document 
+0.00024062053397343032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document 
+0.00033294499102761794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document 
+0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document 
+0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document 
+0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document 
+0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document 
+0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document diff --git a/ALCF/data-lists/polaris/falcon.txt b/ALCF/data-lists/polaris/falcon.txt new file mode 100644 index 0000000000..68aeb2f27b --- /dev/null +++ b/ALCF/data-lists/polaris/falcon.txt @@ -0,0 +1,501 @@ +0.0003547982093445404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document + diff --git a/ALCF/data-lists/polaris/megawiki.txt b/ALCF/data-lists/polaris/megawiki.txt new file mode 100644 index 0000000000..4c4f47df5f --- /dev/null +++ b/ALCF/data-lists/polaris/megawiki.txt @@ -0,0 +1,262 @@ +6.322825248625475e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document +2.325811856369237e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document +5.2178057608273506e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document +5.4558202844532094e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document +8.674401118222175e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document diff --git a/ALCF/data-lists/polaris/open-web-math-train.txt b/ALCF/data-lists/polaris/open-web-math-train.txt new file mode 100644 index 0000000000..caab74fb9f --- /dev/null +++ b/ALCF/data-lists/polaris/open-web-math-train.txt @@ -0,0 +1,13 @@ +0.001451215788905126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document diff --git a/ALCF/data-lists/polaris/pes2o.txt b/ALCF/data-lists/polaris/pes2o.txt new file mode 100644 index 0000000000..7bb62d46b6 --- /dev/null +++ b/ALCF/data-lists/polaris/pes2o.txt @@ -0,0 +1,26 @@ +0.0012499632072059553 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document +0.0012541704774729071 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document diff --git a/ALCF/data-lists/polaris/reddit.txt b/ALCF/data-lists/polaris/reddit.txt new file mode 100644 index 0000000000..4f46ee0e64 --- /dev/null +++ b/ALCF/data-lists/polaris/reddit.txt @@ -0,0 +1,78 @@ 
+0.0005759963691850877 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document diff --git a/ALCF/data-lists/polaris/stack.txt b/ALCF/data-lists/polaris/stack.txt new file mode 100644 index 0000000000..971329159b --- /dev/null +++ b/ALCF/data-lists/polaris/stack.txt @@ -0,0 +1,26 @@ +0.0009994361338078242 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document diff --git a/ALCF/data-lists/polaris/starcoder.txt b/ALCF/data-lists/polaris/starcoder.txt new file mode 100644 index 0000000000..c675b0b84d --- /dev/null +++ b/ALCF/data-lists/polaris/starcoder.txt @@ -0,0 +1,50 @@ +0.004474659408857016 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document +0.0034801959532460023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document + diff --git a/ALCF/data-lists/polaris/tulu.txt b/ALCF/data-lists/polaris/tulu.txt new file mode 100644 index 0000000000..a65ae2b93a 
--- /dev/null +++ b/ALCF/data-lists/polaris/tulu.txt @@ -0,0 +1,66 @@ +0.00032927705604725614 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document 
+0.0002839461156551537 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 
/eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document 
+0.0002514844936507595 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document diff --git a/ALCF/data-lists/polaris/wiki.txt b/ALCF/data-lists/polaris/wiki.txt new file mode 100644 index 0000000000..7759120d87 --- /dev/null +++ b/ALCF/data-lists/polaris/wiki.txt @@ -0,0 +1,2 @@ +0.003548077173506675 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /eagle/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document From caa1a4bfdd159594de2615fc36a580a86a735386 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 11:02:07 -0500 Subject: [PATCH 217/268] Add `ALCF/test_polaris.sh` --- ALCF/test_polaris.sh | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 ALCF/test_polaris.sh diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh new file mode 100644 index 0000000000..cbe2b41c5e --- /dev/null +++ b/ALCF/test_polaris.sh @@ -0,0 +1,84 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on Polaris @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash 
test_polaris.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -z "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. 
Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + export DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-polaris-${NOW}".log +} + +main From b534e09d000a343af0d65b0b6feb652303b74aa1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 11:02:26 -0500 Subject: [PATCH 218/268] Fix duplicate loggers in `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 7ed38614a7..f83ef05fc8 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -25,7 +25,7 @@ # # checkpoint_throughput_calculator # ) # from pathlib import Path -from enrich import get_logger +import logging import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -49,7 +49,7 @@ torch.cuda.set_device(LOCAL_RANK) # ------------------------------------------- # --- [TURN OFF LOGGER ON ALL RANK != 0] ---- -log = get_logger(__name__) +log = logging.getLogger(__name__) log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL") # ---- [SETUP WANDB FROM RANK 0] -------------- WANDB_MODE = os.environ.get('WANDB_MODE', None) From 2c4d7728fa15e762ad9ef896c2bf15c440d70f5e Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 11:03:11 -0500 Subject: [PATCH 219/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 88 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index fdd42b56b8..9b057e0c03 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -16,7 +16,19 @@ export WORKING_DIR="${WORKING_DIR}" printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} -printJobInfo() { +function setupSrun() { + if [[ $(hostname) == login* || $(hostname) == nid* ]]; then + export 
NHOSTS="${SLURM_NNODES:-1}" + export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" + export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" + export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" + else + echo "Skipping setupSrun() on $(hostname)" + fi +} + + +function printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}" echo "- Using $(which python3)" @@ -27,20 +39,33 @@ printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" } +function setupVenv() { + VENV_DIR="$1" + if [[ -d "${VENV_DIR}" ]]; then + echo "Found venv at: ${VENV_DIR}" + source "${VENV_DIR}/bin/activate" + else + echo "Skipping setupVenv() on $(hostname)" + fi +} -setupSrun() { - if [[ $(hostname) == login* || $(hostname) == nid* ]]; then - export NHOSTS="${SLURM_NNODES:-1}" - export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" - export NGPUS="$(( NHOSTS * NGPU_PER_HOST ))" - export SRUN_EXEC="srun --gpus ${NGPUS} --gpus-per-node ${NGPU_PER_HOST} -N ${NHOSTS} -n ${NGPUS} -l -u --verbose" +function loadCondaEnv() { + if [[ "${CONDA_EXE}" ]]; then + echo "Already inside ${CONDA_EXE}, exiting!" 
else - echo "Skipping setupSrun() on $(hostname)" + MODULE_STR="$1" + module load "conda/${MODULE_STR}" + nargs="$#" + if [[ "${nargs}" -ge 2 ]]; then + conda activate "$2" + else + conda activate base + fi fi } -setupLauncher() { +function setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then export LAUNCH_CMD="${DIST_LAUNCH} --cpu-bind depth -d 16 python3 -Wignore ${EXEC}" @@ -53,7 +78,7 @@ setupLauncher() { printf " %s" "$(printMagenta ${LAUNCH_CMD})" } -setDSlauncher() { +function setDSlauncher() { # launcher setting outdir=$1 export hfds="$outdir/hostfile_deepspeed" @@ -68,7 +93,7 @@ setDSlauncher() { fi } -setParams() { +function setParams() { LLAMA_ARGS="" # +----[Parallelism Settings] -------------------------------------------+ # +------[Aurora]--------||-------[SunSpot]-------------+ @@ -155,7 +180,7 @@ setParams() { } -setArgs() { +function setArgs() { # ---- Set DeepSpeed arguments -------------------------------- ds_args=" " ds_args=" --deepspeed ${ds_args}" @@ -186,7 +211,7 @@ setArgs() { } -make_ds_hostfile() { +function make_ds_hostfile() { export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" # ---- Make MPICH hostfile ---------------- hf="${HOSTFILE:-${PBS_NODEFILE}}" @@ -202,7 +227,7 @@ make_ds_hostfile() { # | 1. Git clone ezpz (if not found) | # | 2. Install ezpz (if not installed) | # +---------------------------------------+ -ezpz() { +function ezpz() { if [[ ! 
-d "${WORKING_DIR}/deps/ezpz" ]]; then mkdir -p "${WORKING_DIR}/deps" git clone https://github.com/saforem2/ezpz "${WORKING_DIR}/deps/ezpz" @@ -228,7 +253,7 @@ ezpz() { # | Save important environment variables to .deepspeed_env, which will be | # | forwarded to ALL ranks with DeepSpeed | # +------------------------------------------------------------------------+ -saveDSenv() { +function saveDSenv() { echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" { echo "PATH=${PATH}" ; @@ -240,7 +265,7 @@ saveDSenv() { } > .deepspeed_env } -setOutput() { +function setOutput() { # ---- Specify output location -------------------------------- export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" @@ -253,7 +278,7 @@ setOutput() { echo "!!!Please see logs at ${OUTPUT_DIR}" } -buildDSconfig() { +function buildDSconfig() { # ---- Build DeepSpeed Config --------------------------------- export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}" export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" @@ -267,13 +292,13 @@ buildDSconfig() { } -sumWeights() { +function sumWeights() { local file_list=$1 weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") python3 -c "import numpy as np; print(np.sum(${weights}))" } -sumFiles() { +function sumFiles() { local rd=$1 for f in $("${rd}/*.txt"); do ws=$(sumWeights "${rd}/${f}") @@ -282,7 +307,7 @@ sumFiles() { } -setEnv() { +function setEnv() { # ---- [SunSpot] ------- || ---- [Aurora] -------------- if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then # PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) @@ -323,7 +348,7 @@ setEnv() { } -makeHostfiles() { +function makeHostfiles() { if [[ -n "${HOSTFILE}" ]]; then printf "!! 
USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" else @@ -333,7 +358,7 @@ makeHostfiles() { fi } -setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- +function setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" elif [[ $(hostname) == x1* ]]; then @@ -370,7 +395,7 @@ setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- echo "--------------------" } -generateDSconfig() { +function generateDSconfig() { for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ "$PP" "$DTYPE" do @@ -532,33 +557,34 @@ $flops_profiler EOT } -printBlack() { +function printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" } -printRed() { +function printRed() { printf "\e[1;31m%s\e[0m\n" "$@" } -printGreen() { +function printGreen() { printf "\e[1;32m%s\e[0m\n" "$@" } -printYellow() { +function printYellow() { printf "\e[1;33m%s\e[0m\n" "$@" } -printBlue() { +function printBlue() { printf "\e[1;34m%s\e[0m\n" "$@" } -printMagenta() { +function printMagenta() { printf "\e[1;35m%s\e[0m\n" "$@" } -printCyan() { +function printCyan() { printf "\e[1;36m%s\e[0m\n" "$@" } -printWhite() { + +function printWhite() { printf "\e[1;37m%s\e[0m\n" "$@" } From cfa6b52e82c29018ebe92642e0379f393fe26eb5 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 11:14:02 -0500 Subject: [PATCH 220/268] Update `ALCF/test_{polaris,sirius,sunspot}.sh` --- ALCF/test_polaris.sh | 10 +++++++--- ALCF/test_sirius.sh | 38 ++++++++++++++++++++++++++++++++------ ALCF/test_sunspot.sh | 35 +++++++++++++++++++++++++++++------ 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh index cbe2b41c5e..3a6734f61e 100644 --- a/ALCF/test_polaris.sh +++ b/ALCF/test_polaris.sh @@ -22,7 +22,7 @@ NOW="$(date "+%Y-%m-%d-%H%M%S")" 
######################################################## setup_conda() { if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then - export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" micromamba activate 2024-04-25 @@ -46,7 +46,7 @@ setup_megatron_deepspeed() { exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed - if [[ -z "${GIT_BRANCH-}" ]]; then + if [[ -n "${GIT_BRANCH-}" ]]; then git checkout "${GIT_BRANCH}" fi } @@ -72,7 +72,11 @@ main() { setup_megatron_deepspeed export DEBUG=1 export PBS_O_WORKDIR="$(pwd)" - export DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." + exit 1 + fi export ZERO_STAGE=1 export NUM_LAYERS=10 export MICRO_BATCH=8 diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh index 108649612c..39170fa47c 100755 --- a/ALCF/test_sirius.sh +++ b/ALCF/test_sirius.sh @@ -21,10 +21,14 @@ NOW="$(date "+%Y-%m-%d-%H%M%S")" # mine is called q4-drop ######################################################## setup_conda() { - export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba - shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') - eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" - micromamba activate 2024-04-23 + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which 
python3)" + fi } @@ -42,15 +46,37 @@ setup_megatron_deepspeed() { exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi } main() { - setup_conda + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi setup_megatron_deepspeed export DEBUG=1 export PBS_O_WORKDIR="$(pwd)" - export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sirius/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." 
+ exit 1 + fi export ZERO_STAGE=1 export NUM_LAYERS=10 export MICRO_BATCH=8 diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh index 67f6868d43..b39bff7736 100755 --- a/ALCF/test_sunspot.sh +++ b/ALCF/test_sunspot.sh @@ -21,12 +21,13 @@ NOW="$(date "+%Y-%m-%d-%H%M%S")" # mine is called q4-drop ######################################################## setup_conda() { - if [[ "${SHELL}" = "/bin/zsh" ]]; then - eval "$(~/miniconda3/bin/conda shell.zsh hook)" + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop else - eval "$(~/miniconda3/bin/conda shell.bash hook)" + echo "Found existing python at: $(which python3)" fi - conda activate q4-drop } @@ -44,15 +45,37 @@ setup_megatron_deepspeed() { exit fi git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi } main() { - setup_conda + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. Exiting" + exit 1 + fi setup_megatron_deepspeed export DEBUG=1 export PBS_O_WORKDIR="$(pwd)" - export DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/sunspot/books.txt" + if [[ ! 
-f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." + exit 1 + fi export ZERO_STAGE=1 export NUM_LAYERS=10 export MICRO_BATCH=8 From a3114bf898c035b4c4db366af4e6745c9bc82276 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 11:15:36 -0500 Subject: [PATCH 221/268] Add `ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt` --- .../sunspot/dolma_v1_7_file_list.txt | 2419 +++++++++++++++++ 1 file changed, 2419 insertions(+) create mode 100644 ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt diff --git a/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt new file mode 100644 index 0000000000..5d142522a7 --- /dev/null +++ b/ALCF/data-lists/sunspot/dolma_v1_7_file_list.txt @@ -0,0 +1,2419 @@ +0.0018520780893211373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0000_text_document +0.0017591050606817512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0001_text_document +0.001459052794333798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0002_text_document +0.0007405667281569194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0003_text_document +0.00019420030110896795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0004_text_document +0.0009008668715801845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0005_text_document +0.00015115827957143057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0006_text_document +0.0014552844319220648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0007_text_document +0.0012469861325685161 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0008_text_document +0.00136412011372413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0009_text_document +0.0007064279699221103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0010_text_document +0.0008472240000687427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0011_text_document +0.0001984375713341955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0012_text_document +0.0005472773881697123 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0013_text_document +0.001815779629850992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0014_text_document +0.0018313600689757324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/algebraic-stack-train-0015_text_document +0.0002583902668716813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0000_text_document +0.0002646575141232155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0001_text_document +0.0003165521247456758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0002_text_document +0.0002920706460176214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0003_text_document +0.00028396813182810215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0004_text_document +0.00030445161883108107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0005_text_document +0.00031628781276576474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0006_text_document +0.0003083776568189157 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0007_text_document +0.0003176359471472902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0008_text_document +0.0002536009369131698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0009_text_document +0.0003067491424681363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0010_text_document +0.0002597217257557784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0011_text_document +0.0003788556450109768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0012_text_document +0.0002796563272052598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0013_text_document +0.00033573826524290287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0014_text_document +0.00030523658022800287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0015_text_document +0.00032211552192240096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0016_text_document +0.0003329295675164247 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0017_text_document +0.0003101982186639862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0018_text_document +0.00032361798234223355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0019_text_document +0.0003495541581652915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0020_text_document +0.0002821637448858042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0021_text_document +0.00030399523537629673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0022_text_document +0.0002955658968247219 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0023_text_document +0.00028942158502924254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0024_text_document +0.00028769546171490733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0025_text_document +0.0002938111057234182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0026_text_document +0.0002711150403010948 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0027_text_document +0.00031130095874747565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0028_text_document +0.0003002996118160777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0029_text_document +0.0003732757901604459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0030_text_document +0.00026784205751795894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0031_text_document +0.0002799626521661984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0032_text_document +0.00034334276069078164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0033_text_document +0.0003582469803674965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0034_text_document +0.00031094844818418623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0035_text_document +0.0002766228384977191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0036_text_document +0.00030297116159471485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0037_text_document +0.00027033888377464685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0038_text_document +0.00030090862368377933 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0039_text_document +0.00028543875802490955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0040_text_document +0.00027559768459074204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0041_text_document +0.0003182185533962886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0042_text_document +0.0003311392971435837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0043_text_document +0.00028751652060804325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0044_text_document +0.000303466863212589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0045_text_document +0.00033400462801277524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0046_text_document +0.0002589234031777426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0047_text_document +0.0002913508598466723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0048_text_document +0.0002670572450004856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0049_text_document +0.00032027399105647656 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0050_text_document +0.00032188376258379377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0051_text_document +0.0003161585784100882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0052_text_document +0.0003184249182974135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0053_text_document +0.00030381336664000807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0054_text_document +0.0003190437442184283 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0055_text_document +0.0002537961798200545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0056_text_document +0.0003017817117223326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0057_text_document +0.00028685268513240224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0058_text_document +0.00031265179094451165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0059_text_document +0.00034708319096986816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0060_text_document +0.00026650837943080664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0061_text_document +0.00034588832248507335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0062_text_document +0.0002416982248399037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0063_text_document +0.0003089296918222243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0064_text_document +0.00029137184185700827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0065_text_document +0.00026464226846800774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0066_text_document +0.00030545397919456627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0067_text_document +0.0003206778460448875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0068_text_document +0.00030968971641110967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0069_text_document +0.00023325653928600864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0070_text_document +0.00030526899198338555 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0071_text_document +0.00035376719076633584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0072_text_document +0.000290224385981026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0073_text_document +0.000294650083382008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0074_text_document +0.00028768858128616436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0075_text_document +0.00030856965235527843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0076_text_document +0.00030579942447879054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0077_text_document +0.0002863101084704357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0078_text_document +0.0002870032092492213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0079_text_document +0.000264182727569885 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0080_text_document +0.0002974012367036449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0081_text_document +0.00032238412143059203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0082_text_document +0.00031683716893819036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0083_text_document +0.00031157434937617524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0084_text_document +0.0003411742735695989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0085_text_document +0.00026778444816570715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0086_text_document +0.0003037045797275201 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0087_text_document +0.00027746114370081314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0088_text_document +0.00027148285946862043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0089_text_document +0.00028042950114678207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0090_text_document +0.0003235607816590721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0091_text_document +0.0003086692227306295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0092_text_document +0.00033990349455148105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0093_text_document +0.00030945053208470265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0094_text_document +0.00027309074552265303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0095_text_document +0.00028737393506316194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0096_text_document +0.0003098868328009879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0097_text_document +0.0002614229162588409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0098_text_document +0.0002884388407820923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/arxiv-0099_text_document +0.0031025147279277244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0000_text_document +0.003102019887362634 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0001_text_document +0.0009996745994661548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/books-0002_text_document +0.0002406272620255565 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0000_text_document +0.0002404825539493424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0001_text_document +0.00024062296575435581 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0002_text_document +0.00024069315766818953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0003_text_document +0.00024055829162263452 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0004_text_document +0.00024062053397343032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0005_text_document +0.0002410715545206964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0006_text_document +0.00024024881846087368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0007_text_document +0.0002407074700790688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0008_text_document +0.00024072141428809043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0009_text_document +0.00024027710230872736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0010_text_document +0.0002409111299205489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0011_text_document +0.00024081954058275009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0012_text_document +0.00024086076794990912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0013_text_document +0.00024098672620832446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0014_text_document +0.00024068622303333862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0015_text_document +0.00024140627024291824 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0016_text_document +0.0002414512033594384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0017_text_document +0.00024028742594941463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0018_text_document +0.00024018036089269645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0019_text_document +0.0002398347365034979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0020_text_document +0.00024006780153485276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0021_text_document +0.00024015620270419213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0022_text_document +0.0002408848259695227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0023_text_document +0.0002408023185278831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0024_text_document +0.00024021196580140326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0025_text_document +0.00024077677271297493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0026_text_document +0.00024087392454668027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0027_text_document +0.0002408071293824126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0028_text_document +0.00024042223828845715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0029_text_document +0.0002411484752360495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0030_text_document +0.00023605263746465907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0031_text_document +0.00023471222158326908 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0032_text_document +0.00023432138580287644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0033_text_document +0.00023407385623382327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0034_text_document +0.00023487504174367091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0035_text_document +0.0002341843704976313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0036_text_document +0.00023421993170282486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0037_text_document +0.00023445057969132037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0038_text_document +0.0002337681680073047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0039_text_document +0.000234627964808109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0040_text_document +0.0002338942211888584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0041_text_document +0.00023403849286843386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0042_text_document +0.00023405641310796305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0043_text_document +0.00023349169562397965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0044_text_document +0.00023381157386048856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0045_text_document +0.00023388742993790587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0046_text_document +0.00023363103829469813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0047_text_document +0.00023421141834630477 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0048_text_document +0.00023420564352232565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0049_text_document +0.00023367463699173143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0050_text_document +0.00023344969163567033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0051_text_document +0.00023372196941547188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0052_text_document +0.00023399207645297834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0053_text_document +0.00023357915605505856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0054_text_document +0.00023337585642190864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0055_text_document +0.00023385005470157914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0056_text_document +0.00023301533534493465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0057_text_document +0.00023377864302541782 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0058_text_document +0.00023323745848621437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0059_text_document +0.0002330594611151835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0060_text_document +0.0002334149675026783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0061_text_document +0.00023198945902291534 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0062_text_document +0.00023023784834634142 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0063_text_document +0.00022985623060187217 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0064_text_document +0.0002292605284569516 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0065_text_document +0.00022926593333048894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0066_text_document +0.00022922766406807777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0067_text_document +0.00022898153911167426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0068_text_document +0.0002292473111593315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0069_text_document +0.000228804579400424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0070_text_document +0.00022865485613513526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0071_text_document +0.00022937426835887895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0072_text_document +0.00022917388311587372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0073_text_document +0.0002291660582019043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0074_text_document +0.00022907895248360543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0075_text_document +0.0002294617879920205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0076_text_document +0.0002290452150516566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0077_text_document +0.00022943405619715553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0078_text_document +0.0002296271421006204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0079_text_document +0.00022854791372910372 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0080_text_document +0.00022923123467686557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0081_text_document +0.00022852404355738494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0082_text_document +0.00022847798660086642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0083_text_document +0.0002289604586810316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0084_text_document +0.00022835479834950643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0085_text_document +0.0002289149402884243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0086_text_document +0.00022806655474763446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0087_text_document +0.00022826296420992974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0088_text_document +0.00022906829636213627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0089_text_document +0.0002287628414466998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0090_text_document +0.0002282673911253445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0091_text_document +0.00022869309841939134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0092_text_document +0.0002281540116815451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0093_text_document +0.0002259755756162738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0094_text_document +0.00022562331285233504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0095_text_document +0.0002259061146106053 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0096_text_document +0.00022567670836663787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0097_text_document +0.00022573165387587061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0098_text_document +0.00022508514961670572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0099_text_document +0.00022564642513773356 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0100_text_document +0.00022563088621998788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0101_text_document +0.0002250438755373707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0102_text_document +0.00022524465346241134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0103_text_document +0.00022531737657666812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0104_text_document +0.00022444687519363458 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0105_text_document +0.00022460397498596298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0106_text_document +0.00022454218976501763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0107_text_document +0.00022447528843671366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0108_text_document +0.00022501666332178926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0109_text_document +0.00022453752304377972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0110_text_document +0.00022484451871163002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0111_text_document +0.00022465678847154914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0112_text_document +0.00022453180917044732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0113_text_document +0.0002247278486823009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0114_text_document +0.00022465794828242097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0115_text_document +0.00022431000701925386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0116_text_document +0.00022476020248460963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0117_text_document +0.00022467531771795015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0118_text_document +0.0002236391309945234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0119_text_document +0.00022458764920536007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0120_text_document +0.00022430877426744415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0121_text_document +0.0002247047786127192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0122_text_document +0.0002245298090400035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0123_text_document +0.0002245648831396188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0124_text_document +0.00022292894729820784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0125_text_document +0.00022236668082957533 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0126_text_document +0.0002217622659895442 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0127_text_document +0.00022252452726732609 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0128_text_document +0.00022135333211363678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0129_text_document +0.0002214571757787971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0130_text_document +0.0002217188139237798 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0131_text_document +0.00022144214894640303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0132_text_document +0.00022100172806631854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0133_text_document +0.00022156392409199052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0134_text_document +0.00022134830143710272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0135_text_document +0.00022158598922529453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0136_text_document +0.00022142932483041377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0137_text_document +0.00022120980907786554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0138_text_document +0.00022117917738112441 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0139_text_document +0.00022077089397851235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0140_text_document +0.00022093265074996711 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0141_text_document +0.00022091299741377004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0142_text_document +0.0002205849150703338 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0143_text_document +0.0002210648204787979 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0144_text_document +0.0002214235747364102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0145_text_document +0.00022083907302221787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0146_text_document +0.0002206334237915964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0147_text_document +0.00022065193929912214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0148_text_document +0.00022079775597767288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0149_text_document +0.00022091492909963518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0150_text_document +0.00022095009987097293 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0151_text_document +0.0002208150577180165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0152_text_document +0.00022085759102772088 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0153_text_document +0.00022073789170129016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0154_text_document +0.00022049322781182384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0155_text_document +0.00022083270617761285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0156_text_document +0.00021982452827473632 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0157_text_document +0.00021899870446514259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0158_text_document +0.00021890358773356361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0159_text_document +0.00021875556609042841 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0160_text_document +0.00021861195987201226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0161_text_document +0.00021856782186167455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0162_text_document +0.00021912837771543515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0163_text_document +0.00021900213768517756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0164_text_document +0.00021871675851390374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0165_text_document +0.0002180537056545586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0166_text_document +0.0002188196714327129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0167_text_document +0.00021851362624523464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0168_text_document +0.0002183236795498736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0169_text_document +7.291153618675672e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/c4-0170_text_document +0.0003742481815405742 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0000_text_document +0.00038204855962733055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0001_text_document +0.00038821818392663593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0002_text_document +0.00038723332988783727 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0003_text_document +0.00038916141142149904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0004_text_document +0.00038049542523949033 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0005_text_document +0.0003854755539534284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0006_text_document +0.00024202756466512517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0007_text_document +0.0003915405155008087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0008_text_document +0.0003927382151931033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0009_text_document +0.0003839151202260479 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0010_text_document +0.00040006817468967907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0011_text_document +0.00040318965964443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0012_text_document +0.0003831013019452741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0013_text_document +0.00039166638383204036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0014_text_document +0.00039962784023961004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0015_text_document +0.00039536707853602614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0016_text_document +0.0004204304698247758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0017_text_document +0.00041538899178693555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0018_text_document +0.00039186953333675306 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0019_text_document +0.00038945837196504305 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0020_text_document +0.0003919951238929062 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0021_text_document +0.00044377065718528966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0022_text_document +0.0004407759068603017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0023_text_document +0.0002487811895843715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0024_text_document +0.00039349432045556636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0025_text_document +0.00041223198559462343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0026_text_document +0.0004036573014830213 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0027_text_document +0.0003825982215521807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0028_text_document +0.00040386867133151386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0029_text_document +0.00024460575279105167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0030_text_document +0.000269029789531335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0031_text_document +0.0003573757493252864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0032_text_document +0.0004600876681392076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0033_text_document +0.0002605354166397086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0034_text_document +0.0003882502452157999 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0035_text_document +0.0002466747612126512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0036_text_document +0.0004024726105072402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0037_text_document +0.00040820631128483644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0038_text_document +0.0002691094350403538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0039_text_document +0.00026916830387277267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0040_text_document +0.0004204663297880574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0041_text_document +0.00042379698687085554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0042_text_document +0.0004502169227311871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0043_text_document +0.0002661708937015295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0044_text_document +0.00031239486948031334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0045_text_document +0.0003109054589936201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0046_text_document +0.00045873053079760646 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0047_text_document +0.00022904931423244635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0048_text_document +0.0003813462028433663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0049_text_document +0.00039188129256500874 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0050_text_document +0.00045124222276983765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0051_text_document +0.00048138658436853695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0052_text_document +0.0003944178776279866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0053_text_document +0.00039941569676754006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0054_text_document +0.00037952761190240494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0055_text_document +0.0003944870860881476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0056_text_document +0.0003891842411856621 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0057_text_document +0.000387688981934861 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0058_text_document +0.00039197953876258005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0059_text_document +0.00039007915280311206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0060_text_document +0.0003995520363699188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0061_text_document +0.00039230985654592406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0062_text_document +0.0003929472067173851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0063_text_document +0.0003924096172671473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0064_text_document +0.0003881636143629905 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0065_text_document +0.000389790617937084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0066_text_document +0.00037351762309221023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0067_text_document +0.0003630196170929407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0068_text_document +0.00033532465765142113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0069_text_document +0.0003076088685761823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0070_text_document +0.00039463850897720803 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0071_text_document +0.0002843816115231449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0072_text_document +0.0002909175709416474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0073_text_document +0.00028867170997202486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0074_text_document +0.0002838644617723659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0075_text_document +0.00029027869525543416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0076_text_document +0.0002821339567560056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0077_text_document +0.0002922988877045601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0078_text_document +0.0002866955958315786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0079_text_document +0.0002865271754558126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0080_text_document +0.0002861247475618473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0081_text_document +0.0002826681072408606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0082_text_document +0.0002849746458282827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0083_text_document +0.0002816966633435316 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0084_text_document +0.00026255342235948463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0085_text_document +0.0002552895098829678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0086_text_document +0.00025990194083107813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0087_text_document +0.0002524062657685835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0088_text_document +0.0002538577379748611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0089_text_document +0.0002561415177406761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0090_text_document +0.00026206253059694905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0091_text_document +0.00026168095406910565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0092_text_document +0.0002601305742008613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0093_text_document +0.00025200823006814814 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0094_text_document +0.0003229951981263502 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0095_text_document +0.00037289448266476045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0096_text_document +0.0003807825862179898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0097_text_document +0.0003616333738191483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0098_text_document +0.0003665117918907636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0099_text_document +0.0003684186453633228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0100_text_document +0.0003589330610806066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0101_text_document +0.00036383861418030395 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0102_text_document +0.000359841363355303 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0103_text_document +0.00036431044063050464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0104_text_document +0.0003668574090358279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0105_text_document +0.000362768263620199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0106_text_document +0.0003501888032771077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0107_text_document +0.000352401968221528 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0108_text_document +0.0003541019701869794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0109_text_document +0.0003628121865546891 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0110_text_document +0.0003752582953758773 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0111_text_document +0.00037902046230424966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0112_text_document +0.0003777927146925147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0113_text_document +0.0003760676130509053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0114_text_document +0.00034046049078755405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0115_text_document +0.0003338847563259091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0116_text_document +0.00033294499102761794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0117_text_document +0.0004912026198265864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0118_text_document +0.00032064363474664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0119_text_document +0.00032154190389541214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0120_text_document +0.00032309660151746207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0121_text_document +0.00031181143365304544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0122_text_document +0.00031046092294569104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0123_text_document +0.00031150165249068046 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0124_text_document +0.0003041314265988224 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0125_text_document +0.0003024834909739394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0126_text_document +0.0003019936835833604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0127_text_document +0.000292329665283177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0128_text_document +0.0002867061143144972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0129_text_document +0.00028443615610701707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0130_text_document +0.00028462291013755945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0131_text_document +0.0002793538601205013 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0132_text_document +0.00027306573977044246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0133_text_document +0.00027097155673336525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0134_text_document +0.0002752934202112985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0135_text_document +0.00043042012694697647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0136_text_document +0.00047495648822986177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0137_text_document +0.00047755032493473855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0138_text_document +0.0004706974343933747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0139_text_document +0.00046682163297771817 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0140_text_document +0.0004616765425874178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0141_text_document +0.00030644496751628097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0142_text_document +0.0002909492555358308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0143_text_document +0.00027272036068261724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0144_text_document +0.0004101070217315588 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0145_text_document +0.0003728914338834357 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0146_text_document +0.00036546911442305647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0147_text_document +0.0003669945482407483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0148_text_document +0.0003715902407424017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0149_text_document +0.00035837486406683366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0150_text_document +0.0003573318538685469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0151_text_document +0.0003553784893071916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0152_text_document +0.0004920659809912352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0153_text_document +0.0004533619411303183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0154_text_document +0.00045067066057818706 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0155_text_document +0.00044396985139270645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0156_text_document +0.00043198288204468477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0157_text_document +0.00043005174223738454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0158_text_document +0.00041847118430776784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0159_text_document +0.00042952036375796664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0160_text_document +0.00043420594647324267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0161_text_document +0.0003461123241053012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0162_text_document +0.0003408581597849182 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0163_text_document +0.00033172705422182547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0164_text_document +0.0003392566490686136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0165_text_document +0.00033578341518385483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0166_text_document +0.0003439196710518844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0167_text_document +0.00034559163447085543 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0168_text_document +0.00033762478642902825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0169_text_document +0.00033215210055107224 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0170_text_document +0.00033423579608014966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0171_text_document +0.0004963355016025102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0172_text_document +0.0004996862761456923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0173_text_document +0.0005000551829325451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0174_text_document +0.0005004212610098755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0175_text_document +0.00027768695585500585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0176_text_document +0.00028395983854338433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0177_text_document +0.00027835826303062254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0178_text_document +0.0002740073176010804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0179_text_document +0.0002791830529274016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0180_text_document +0.0002796863816194411 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0181_text_document +0.00026697453022672804 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0182_text_document +0.0002594197440280141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0183_text_document +0.0003779565697649222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0184_text_document +0.00041835823476586606 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0185_text_document +0.00043788493575265915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0186_text_document +0.0002731731970096006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0187_text_document +0.000276305847423402 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0188_text_document +0.0002704955773958623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0189_text_document +0.0002629635944827518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0190_text_document +0.000260070956974436 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0191_text_document +0.00025661553791456334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0192_text_document +0.00025794727207576157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0193_text_document +0.00025295733980001527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0194_text_document +0.0003788106407021029 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0195_text_document +0.0004882344027669431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0196_text_document +0.0003275324309642705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0197_text_document +0.0004803401856640094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0198_text_document +0.00046720138323433943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0199_text_document +0.00043527810307095335 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0200_text_document +0.00043905395741627827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0201_text_document +0.00048774175867331425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0202_text_document +0.00048380704121346737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0203_text_document +0.0004779011848346118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0204_text_document +0.00046255587581908036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0205_text_document +0.00045127922880511576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0206_text_document +0.0004503891485256095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0207_text_document +0.0004450142332303422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0208_text_document +0.00044630282482516654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0209_text_document +0.00044325014465743616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0210_text_document +0.0004263874842796447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0211_text_document +0.0004217530913646938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0212_text_document +0.000415120314341852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0213_text_document +0.00040987168279144537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0214_text_document +0.00033468337266607834 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0215_text_document +0.0003353094464683005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0216_text_document +0.0004833936821707294 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0217_text_document +0.00047194878988920935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0218_text_document +0.0004648324126996427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0219_text_document +0.0004562345003964941 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0220_text_document +0.0004933203505465098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0221_text_document +0.0003530166075325466 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0222_text_document +0.00035368548192804685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0223_text_document +0.0004872620828289663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0224_text_document +0.00048293889392426456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0225_text_document +0.00047936768462267655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0226_text_document +0.00047821013991587545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0227_text_document +0.0004660610308564753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0228_text_document +0.000394683430103437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0229_text_document +0.00039165053441571324 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0230_text_document +0.0003906936040164381 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0231_text_document +0.00038074803919159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0232_text_document +0.0003686529291578143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0233_text_document +0.00035832920428870976 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0234_text_document +0.00035929024535947033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0235_text_document +0.0003538226556050544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0236_text_document +0.0003584167868708799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0237_text_document +0.0003480507542594234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0238_text_document +0.0003413709023543034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0239_text_document +0.00034001304759361455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0240_text_document +0.00033430532902756514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0241_text_document +0.00046519252660631277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0242_text_document +0.0002938876402514769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0243_text_document +0.00028676090994509047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0244_text_document +0.00027296150117506716 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0245_text_document +0.00026513502621960483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0246_text_document +0.0002680081327926125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0247_text_document +0.00025831225828720344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0248_text_document +0.00026647037295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0249_text_document +0.0002525733734572654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0250_text_document +0.00025831708887575375 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0251_text_document +0.00042487627444443476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0252_text_document +0.0004951213245023891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0253_text_document +0.0004804051413177752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0254_text_document +0.0004662397611340532 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0255_text_document +0.0004550138655253933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0256_text_document +0.00044494909122746795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0257_text_document +0.0002899112253051385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0258_text_document +0.0004372879736279761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0259_text_document +0.0004529568099252922 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0260_text_document +0.00045127826158829573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0261_text_document +0.0004436558176737439 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0262_text_document +0.0004419233237678378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0263_text_document +0.000434589215880319 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0264_text_document +0.00029153613207706566 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0265_text_document +0.0004312458058738854 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0266_text_document +0.00028741854968757313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0267_text_document +0.00046853200754421234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0268_text_document +0.0004949145252030074 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0269_text_document +0.00044459683920483167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0270_text_document +0.0003836095306696336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0271_text_document +0.0003789760237872398 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0272_text_document +0.0003749227438304427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0273_text_document +0.0003628558277173369 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_head-0274_text_document +0.00039468301394041474 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0000_text_document +0.00038874701821614864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0001_text_document +0.0004158492456077867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0002_text_document +0.00042360504554060077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0003_text_document +0.00040386729844317623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0004_text_document +0.00027595096702902474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0005_text_document +0.00043638766787829135 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0006_text_document +0.0002218691596850179 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0007_text_document +0.0004437566108089954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0008_text_document +0.0003889996411609667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0009_text_document +0.00043454421906537704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0010_text_document +0.0004522564392830988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0011_text_document +0.00041517835659357416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0012_text_document +0.0002614360863446896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0013_text_document +0.00037543522111463596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0014_text_document +0.0004386190133514781 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0015_text_document +0.00046358333286115075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0016_text_document +0.00043186261317942404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0017_text_document +0.0002377581602097957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0018_text_document +0.00025973334085074254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0019_text_document +0.00040139099332000796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0020_text_document +0.00043674860686687174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0021_text_document +0.00040853289309329373 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0022_text_document +0.000242910191729688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0023_text_document +0.0004431071731750582 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0024_text_document +0.0004388092670482523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0025_text_document +0.000381418866255965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0026_text_document +0.0004100117296419717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0027_text_document +0.00042469230366022745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0028_text_document +0.00041744151905374254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0029_text_document +0.00022835699906752945 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0030_text_document +0.0004380161085387397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0031_text_document +0.00044803212381807456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0032_text_document +0.00040554932796137236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0033_text_document +0.0004234508646347761 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0034_text_document +0.00043341209652360653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0035_text_document +0.00023966604734537185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0036_text_document +0.000259165907316014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0037_text_document +0.0004270653021833602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0038_text_document +0.0004341547032162028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0039_text_document +0.0004111478117275994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0040_text_document +0.0004299383567984396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0041_text_document +0.0004241899124590779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0042_text_document +0.0004502719349364145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0043_text_document +0.00038994621469645615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0044_text_document +0.0003859912398894952 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0045_text_document +0.0004247535950310557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0046_text_document +0.000386982084327716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0047_text_document +0.0004196451040053251 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0048_text_document +0.0004096278509782259 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0049_text_document +0.0004373334932695721 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0050_text_document +0.0004180889975240641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0051_text_document +0.00042079636929672745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0052_text_document +0.00038063574611812913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0053_text_document +0.0003817505891515542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0054_text_document +0.0004420096268860222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0055_text_document +0.00039182670726410623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0056_text_document +0.0003635667850372299 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0057_text_document +0.00041564996472055667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0058_text_document +0.000400529358757286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0059_text_document +0.0003939113874958451 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0060_text_document +0.00039066622068940996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0061_text_document +0.0004290098538807143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0062_text_document +0.0004240739958197099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0063_text_document +0.00040775392659215333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0064_text_document +0.0004091634200396925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0065_text_document +0.00042299190476617914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0066_text_document +0.0003701492680344151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0067_text_document +0.0003807353844384635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0068_text_document +0.00038813507771983156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0069_text_document +0.00040072346558408346 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0070_text_document +0.0003603595180423597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0071_text_document +0.00038799421353112465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0072_text_document +0.00037575235582264926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0073_text_document +0.0004239190342959713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0074_text_document +0.0004606044799136546 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0075_text_document +0.00045107950652529253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0076_text_document +0.0004391947201871058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0077_text_document +0.0004457516661123035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0078_text_document +0.0004301297170991686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0079_text_document +0.00044661704164586694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0080_text_document +0.0004438849846114837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0081_text_document +0.0004444205734316823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0082_text_document +0.0004190924165303394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0083_text_document +0.00043942581131677875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0084_text_document +0.00021568459798090663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0085_text_document +0.0003814929225407199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0086_text_document +0.0003217453179359235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0087_text_document +0.00031719591470267974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0088_text_document +0.00032434115726922137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0089_text_document +0.0004079911120371051 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0090_text_document +0.000329492766381148 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0091_text_document +0.0003845916162001633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0092_text_document +0.0003835208964390098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0093_text_document +0.00037847334157173194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0094_text_document +0.00038296039903791865 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0095_text_document +0.00037896336828472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0096_text_document +0.00037620974396391355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0097_text_document +0.00037420590727111843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0098_text_document +0.000340490625886403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0099_text_document +0.0003078314411035827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0100_text_document +0.00034153990750656097 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0101_text_document +0.0003308858103982067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0102_text_document +0.0003452640607156025 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0103_text_document +0.00033095276418403455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0104_text_document +0.0003116308995860414 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0105_text_document +0.00032446713226408477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0106_text_document +0.0003015816821912984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0107_text_document +0.00031612418775706894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0108_text_document +0.0003278516344971041 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0109_text_document +0.00033079446736097217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0110_text_document +0.00032278977146550837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0111_text_document +0.00032065272988207914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0112_text_document +0.0003936696452406576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0113_text_document +0.0003450109536627789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0114_text_document +0.0003339787189919641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0115_text_document +0.0003284303856176974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0116_text_document +0.00033652677276843477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0117_text_document +0.0003257822443845694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0118_text_document +0.0003293985569149334 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0119_text_document +0.0003310360260148262 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0120_text_document +0.0003233770986418526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0121_text_document +0.0003172280092149422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0122_text_document +0.0003160674744292835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0123_text_document +0.00030931090289598506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0124_text_document +0.0003093173886443107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0125_text_document +0.00033167847081104083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0126_text_document +0.00031131501311729723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0127_text_document +0.00031046608876279845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0128_text_document +0.00030569235942207244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0129_text_document +0.00030777943671285197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0130_text_document +0.00029303314290956683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0131_text_document +0.0003045824546400205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0132_text_document +0.00030360880677729793 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0133_text_document +0.00031646239964835433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0134_text_document +0.0003129122300603785 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0135_text_document +0.00031060464956661433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0136_text_document +0.000311819032500067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0137_text_document +0.0002977872483902282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0138_text_document +0.0003009448600922438 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0139_text_document +0.00028610292098537774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0140_text_document +0.0002988326876216654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0141_text_document +0.00028550828372819075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0142_text_document +0.0002830381750875739 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0143_text_document +0.0002848495855927156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0144_text_document +0.0002856443760308144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0145_text_document +0.00027442895344188584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0146_text_document +0.0002681160554049462 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0147_text_document +0.0003421482544126989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0148_text_document +0.0004005872948449718 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0149_text_document +0.0003930123959320308 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0150_text_document +0.0003867271832275778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0151_text_document +0.000380805140455254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0152_text_document +0.0003814769861947819 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0153_text_document +0.00038025170883282324 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0154_text_document +0.0003738026647867475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0155_text_document +0.00018960856915036276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0156_text_document +0.0003697177501953134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0157_text_document +0.00036674194328136693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0158_text_document +0.00036447406838697555 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0159_text_document +0.00036686410861101255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0160_text_document +0.00035915267825103423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0161_text_document +0.0003624758404026675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0162_text_document +0.0002822812140180794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0163_text_document +0.00030620512946920813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0164_text_document +0.000294249776520589 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0165_text_document +0.00030238536967523434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0166_text_document +0.00029509593361580754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0167_text_document +0.0002906912701830899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0168_text_document +0.0002921944165474959 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0169_text_document +0.00028358919691127954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0170_text_document +0.0002813182772323272 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0171_text_document +0.00027442640800299205 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0172_text_document +0.0002747820342933984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0173_text_document +0.0002747584403979717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0174_text_document +0.00027499129634862444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0175_text_document +0.0002712050404257197 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0176_text_document +0.0002616256943143254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0177_text_document +0.00026769938929002815 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0178_text_document +0.00038396081322727017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0179_text_document +0.0003863140490027991 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0180_text_document +0.00037702277513203237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0181_text_document +0.0003633274156107032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0182_text_document +0.0003587473889240435 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0183_text_document +0.0003507672084278415 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0184_text_document +0.00033776425499780385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0185_text_document +0.0003377914127574796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0186_text_document +0.00032948015659161326 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0187_text_document +0.00033245638541392985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0188_text_document +0.00031080707640648695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0189_text_document +0.0002976903331149755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0190_text_document +0.0002965121463725523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0191_text_document +0.0002933849695266647 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0192_text_document +0.0002837035078508233 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0193_text_document +0.00028684569079589323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0194_text_document +0.0003145192320802359 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0195_text_document +0.0003566937253273515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0196_text_document +0.0003470199109592918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0197_text_document +0.0003060245312041868 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0198_text_document +0.0002650817213818789 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0199_text_document +0.0002643604938780134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0200_text_document +0.000299350876031416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0201_text_document +0.0003178540797697938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0202_text_document +0.000271850367887767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0203_text_document +0.00031349896596549 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0204_text_document +0.00031749734412765755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0205_text_document +0.0003791137842391209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0206_text_document +0.0003742334169957992 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0207_text_document +0.0003705639757351107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0208_text_document +0.0003126986769797042 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0209_text_document +0.00031038132814561196 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0210_text_document +0.00036464437173804883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0211_text_document +0.0003569480488951322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0212_text_document +0.0003541239221619106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0213_text_document +0.00035315297411308053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0214_text_document +0.0003572451925404141 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0215_text_document +0.0003514986129411253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0216_text_document +0.0003521798298425866 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0217_text_document +0.00034553677439244716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0218_text_document +0.000349004719809412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0219_text_document +0.0003468247484872769 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0220_text_document +0.0003465822608356558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0221_text_document +0.00035410983132162007 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0222_text_document +0.0003487908354969444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0223_text_document +0.0003479024763238147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0224_text_document +0.000341412530646823 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0225_text_document +0.00034451316273667034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0226_text_document +0.0002618849993484869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0227_text_document +0.00026788679978901144 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0228_text_document +0.00027450670773227214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0229_text_document +0.0002661273129899329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0230_text_document +0.00026836569676402957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0231_text_document +0.00026155876975483236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0232_text_document +0.0002609276830117151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0233_text_document +0.0002644161630512771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0234_text_document +0.00036789208972872557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0235_text_document +0.00037829849439990513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0236_text_document +0.0003788894943523098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0237_text_document +0.0003617207777959397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0238_text_document +0.0002541334487248998 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0240_text_document +0.0002707945538071073 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0241_text_document +0.00027046282716455214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0242_text_document +0.0002652443167243215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0243_text_document +0.0002685859923850986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0244_text_document +0.00025734961751176414 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0245_text_document +0.000259041720872915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0246_text_document +0.00025340107274823446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0247_text_document +0.00025757135121837893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0248_text_document +0.00025617700500574084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0249_text_document +0.0002566931670562857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0250_text_document +0.0002543871190716101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0251_text_document +0.00024997565589481713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0252_text_document +0.0002954079779456287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0253_text_document +0.00034890741135252835 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0254_text_document +0.0003473298137731525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0255_text_document +0.0003296959618486435 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0256_text_document +0.0003304520061604598 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0257_text_document +0.00032377956175729824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0258_text_document +0.00031700696295168713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0259_text_document +0.0003060382346081943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0260_text_document +0.0003012003005056863 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0261_text_document +0.0002981074073993884 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0262_text_document +0.0002922128825950705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0263_text_document +0.000348901087722931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0264_text_document +0.0003408286289467841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0265_text_document +0.0003410649680770183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0266_text_document +0.0003358524215576502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0267_text_document +0.0003343661874989231 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0268_text_document +0.00032810573699389156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0269_text_document +0.00032261449539097497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0270_text_document +0.0003162694866049203 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0271_text_document +0.0003158381156468853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0272_text_document +0.000317376061083603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0273_text_document +0.0003125788639953052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0274_text_document +0.0003010105041885602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0275_text_document +0.0003065865059090678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0276_text_document +0.0003084275726508053 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0277_text_document +0.00030966560718296085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0278_text_document +0.0002957728057853081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0279_text_document +0.00029904164542325336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0280_text_document +0.0002955358888729187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0281_text_document +0.00028692976446931544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0282_text_document +0.0002923476214935797 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0283_text_document +0.0002893691697212419 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0284_text_document +0.0002855895211981585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0285_text_document +0.00027968347097626246 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0286_text_document +0.0002810783462604979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0287_text_document +0.00027794080455729715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0288_text_document +0.00034784376461416953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0289_text_document +0.0003488347959010943 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0290_text_document +0.00034790583710250724 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0291_text_document +0.000345913166618151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0292_text_document +0.00033801936268066675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0293_text_document +0.0003290591130212315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0294_text_document +0.00034051399521366823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0295_text_document +0.00032470943131841784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0296_text_document +0.00031679540050914276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0297_text_document +0.00031814596342422325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0298_text_document +0.0003156466289485036 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0299_text_document +0.00029985010879003633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0300_text_document +0.0002905176377776361 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0301_text_document +0.0004206836775460856 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0302_text_document +0.00020660449162246918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0303_text_document +0.0003461727254468087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0304_text_document +0.00020592870907067763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0305_text_document +0.00034173505299233005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0306_text_document +0.0004052437256652738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0307_text_document +0.0004080650901351697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0308_text_document +0.00039778184149144276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0309_text_document +0.00039046311464950275 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0310_text_document +0.00039043444911071384 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0311_text_document +0.000388575704932843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0312_text_document +0.00019737533145666597 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0313_text_document +0.00037610755595812403 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0314_text_document +0.00037315400127598317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0315_text_document +0.00037415028580922163 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0316_text_document +0.00036694041707212337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0317_text_document +0.00018947219857306515 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0318_text_document +0.00037046050826533545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0319_text_document +0.0003587440768559087 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0320_text_document +0.00034623936498708903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0321_text_document +0.0003502289592617922 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0322_text_document +0.00034692398063649823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0323_text_document +0.000339340809421849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0324_text_document +0.0003360510394816983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0325_text_document +0.0003354673850814145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0326_text_document +0.00032937682875877047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0327_text_document +0.00032844505049317715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0328_text_document +0.00028287199339908627 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0329_text_document +0.0002795217197003578 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0330_text_document +0.00028048955601883463 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0331_text_document +0.0002769326396439027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0332_text_document +0.0002727090021299243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0333_text_document +0.0002726577841024554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0334_text_document +0.00026663619593455374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0335_text_document +0.00026068042672138127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0336_text_document +0.0002637704114326801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0337_text_document +0.0002593043567100412 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0338_text_document +0.0002599897110113453 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0339_text_document +0.0002435078682758859 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0340_text_document +0.0002450530071379054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0341_text_document +0.00024233331983743606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0342_text_document +0.0002934750947999535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0343_text_document +0.00033241226364044474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0344_text_document +0.00032938406090272075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0345_text_document +0.00032778705403953246 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0346_text_document +0.00032184551480398754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0347_text_document +0.00031874002264945737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0348_text_document +0.0003165319685666433 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0349_text_document +0.00031307071173376295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0350_text_document +0.00031119524184911957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0351_text_document +0.0003102253344576429 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0352_text_document +0.0003088976240383192 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0353_text_document +0.0002951410823077708 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0354_text_document +0.00029772657676757413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0355_text_document +0.0003056048989909935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0356_text_document +0.00031991305381648026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0357_text_document +0.00030890256978362426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0358_text_document +0.0003109382904091933 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0359_text_document +0.00031035798529690644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0360_text_document +0.00030741666395911753 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0361_text_document +0.0002989918594861846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0362_text_document +0.00029569635443989434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0363_text_document +0.0002973992445667285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0364_text_document +0.000293397351001072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0365_text_document +0.00028737817438047954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0366_text_document +0.00028252738144009747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0367_text_document +0.0002805511898623541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0368_text_document +0.0003718020784620472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0369_text_document +0.0003499713845765235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0370_text_document +0.00034283547445326676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0371_text_document +0.00031464759888838765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0372_text_document +0.00033188946446414833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0373_text_document +0.000326084432195463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0374_text_document +0.0003764568303917893 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0375_text_document +0.0003604955598858414 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0376_text_document +0.0003655654554133222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0377_text_document +0.00035762304033750504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0378_text_document +0.00038478883950347103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_middle-0379_text_document +0.00027735714341247454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0000_text_document +0.00028139534607773563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0001_text_document +0.00019777292251713763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0002_text_document +0.000285571704874486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0003_text_document +0.00028543482146244363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0004_text_document +0.00019434234484256758 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0005_text_document +0.00027854908176986763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0006_text_document +0.0002847068039566143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0007_text_document +0.00028672356943064853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0008_text_document +0.00027782687605808177 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0009_text_document +0.0002843539634105203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0010_text_document +0.0002894748379090401 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0011_text_document +0.0002868852440186493 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0012_text_document +0.0002818504885373851 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0013_text_document +0.00028680112812941034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0014_text_document +0.00019258978168723977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0015_text_document +0.00028760637934715155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0016_text_document +0.0002820439443912918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0017_text_document +0.0002831001054410018 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0018_text_document +0.00029001901552467397 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0019_text_document +0.00027779449377883156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0020_text_document +0.00019949837437516796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0021_text_document +0.0002907306472984446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0022_text_document +0.00027814858381318327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0023_text_document +0.00019472790889161432 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0024_text_document +0.00020472626596924125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0025_text_document +0.0002870045081974301 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0026_text_document +0.00019812241927078482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0027_text_document +0.0002817553333369554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0028_text_document +0.00027829782796642117 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0029_text_document +0.00028289431732284113 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0030_text_document +0.0002795526296717729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0031_text_document +0.00027682829988044574 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0032_text_document +0.0002895432402719184 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0033_text_document +0.0002823174903941811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0034_text_document +0.00028170972351837796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0035_text_document +0.00027807915877838826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0036_text_document +0.00028588515681452956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0037_text_document +0.00028112324090816726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0038_text_document +0.00020636178289985485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0039_text_document +0.00019447255290980535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0040_text_document +0.0002850824220591452 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0041_text_document +0.00027856429520116784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0042_text_document +0.0002820880676635633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0043_text_document +0.00028943902215995714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0044_text_document +0.0002676366291085329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0045_text_document +0.00023806333809954687 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0046_text_document +0.00024526460430233455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0047_text_document +0.00023876876664622726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0048_text_document +0.00023379770334179805 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0049_text_document +0.00024175151269138382 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0050_text_document +0.00023386583242595706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0051_text_document +0.00023771797150160827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0052_text_document +0.0002262748967483896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0053_text_document +0.0002408148346432682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0054_text_document +0.00023398651720444235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0055_text_document +0.00022989433874474592 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0056_text_document +0.00023948500543957772 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0057_text_document +0.0002331594076859196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0058_text_document +0.00023375132439600242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0059_text_document +0.00023923410909668642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0060_text_document +0.00023952796315562954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0061_text_document +0.0002327466076905069 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0062_text_document +0.00023082758956797212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0063_text_document +0.0002240509275524448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0064_text_document +0.00022798879995765268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0065_text_document +0.000221172516774386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0066_text_document +0.00021767045123534623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0067_text_document +0.00021982832794804484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0068_text_document +0.00021971626543789102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0069_text_document +0.00022566565206920132 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0070_text_document +0.0002181984894194856 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0071_text_document +0.00021831417549554653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0072_text_document +0.00021601405421187145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0073_text_document +0.00022275733725519607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0074_text_document +0.00021847734911973986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0075_text_document +0.0002243591012664014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0076_text_document +0.00021688758139483833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0077_text_document +0.0002182953624789215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0078_text_document +0.00020475155724026002 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0079_text_document +0.00021498078062960065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0080_text_document +0.0002157914337233064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0081_text_document +0.00021781838494967963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0082_text_document +0.00021723242266814558 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0083_text_document +0.0002176782686553837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0084_text_document +0.0003486179404943968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0085_text_document +0.00034882846352857634 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0086_text_document +0.00031400868448352596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0087_text_document +0.00030273484020011963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0088_text_document +0.00029895889118145404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0089_text_document +0.00029770764609621714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0090_text_document +0.0002990181332116852 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0091_text_document +0.00029653733972285996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0092_text_document +0.00029624649222942476 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0093_text_document +0.00029625609720203576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0094_text_document +0.00029731928930852147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0095_text_document +0.00029011721326148513 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0096_text_document +0.00028849788197494655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0097_text_document +0.00021601278623858145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0098_text_document +0.00021319599281739178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0099_text_document +0.0002153325290600083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0100_text_document +0.00018566946174516558 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0101_text_document +0.00020736824394291617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0102_text_document +0.00020857419820128004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0103_text_document +0.00020058526129536423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0104_text_document +0.00020745812166665217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0105_text_document +0.00020652171015271702 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0106_text_document +0.00020643808911278608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0107_text_document +0.00020040513914482103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0108_text_document +0.00020598050188272898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0109_text_document +0.0001969184139343296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0110_text_document +0.0001972748812937012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0111_text_document +0.0002038556751586195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0112_text_document +0.00020245186011313464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0113_text_document +0.00019950381422038783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0114_text_document +0.00020837055459665258 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0115_text_document +0.00020371856218246096 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0116_text_document +0.00019537612301625791 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0117_text_document +0.00019914984508813857 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0118_text_document +0.0002053787713691309 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0119_text_document +0.00019082100541008637 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0120_text_document +0.00020397153334531813 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0121_text_document +0.0002021462693077317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0122_text_document +0.00019609357008124035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0123_text_document +0.00019693256622486236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0124_text_document +0.00020007239732428112 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0125_text_document +0.00020467075741591954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0126_text_document +0.00019584883400022932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0127_text_document +0.00019135050391176972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0128_text_document +0.0003362829834208298 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0129_text_document +0.00034013691154784095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0130_text_document +0.00033215887031941976 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0131_text_document +0.00032681189065396707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0132_text_document +0.0003149138485493094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0133_text_document +0.00030179177307540077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0134_text_document +0.0002923278437581119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0135_text_document +0.00029470052278994486 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0136_text_document +0.0002994095093045731 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0137_text_document +0.00029033525096085037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0138_text_document +0.00029390798852496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0139_text_document +0.0002916230924130842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0140_text_document +0.00029419886374594913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0141_text_document +0.0002865469756730764 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0142_text_document +0.00021191292549942086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0143_text_document +0.00021369664817409847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0144_text_document +0.00021612485624266726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0145_text_document +0.00022242192634588478 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0146_text_document +0.00014605095659989698 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0147_text_document +0.00022070626106341693 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0148_text_document +0.0002174420774054071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0149_text_document +0.00021325858963116995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0150_text_document +0.0002124322999488052 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0151_text_document +0.0002081218896969054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0152_text_document +0.0002108710211556957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0154_text_document +0.00020686867095978426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0155_text_document +0.00020895752681041895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0156_text_document +0.00020741922266415738 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0157_text_document +0.0002069112657197308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0158_text_document +0.00020644627473468118 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0159_text_document +0.00020332991338121604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0160_text_document +0.0003560895677789848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0161_text_document +0.00032915779111908214 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0162_text_document +0.00033810613317040864 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0163_text_document +0.00033729626594036923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0164_text_document +0.00033550342864602944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0165_text_document +0.00034173474024556906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0166_text_document +0.000331505340748827 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0167_text_document +0.0003270050330117195 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0168_text_document +0.00032585275329172556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0169_text_document +0.0003143383203190604 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0170_text_document +0.00031655199110388894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0171_text_document +0.00030738872158476413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0172_text_document +0.00030838388352699285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0173_text_document +0.0003053596995351888 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0174_text_document +0.00031836304739584593 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0175_text_document +0.000315315435873905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0176_text_document +0.0003087116248965243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0177_text_document +0.00030396790625537645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0178_text_document +0.0003335812246032149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0179_text_document +0.00034570956323095843 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0180_text_document +0.00034563035636675786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0181_text_document +0.00033411265479076335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0182_text_document +0.00034439191141692787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0183_text_document +0.0003364483125496565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0184_text_document +0.0003299500453608033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0185_text_document +0.00033163377700074837 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0186_text_document +0.00032638649660627673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0187_text_document +0.00032616167939645234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0188_text_document +0.0003205289298760723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0189_text_document +0.00031939393740815355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0190_text_document +0.00031593164066731296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0191_text_document +0.00031928871111254405 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0192_text_document +0.00029670189073175004 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0193_text_document +0.00020517703846735904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0194_text_document +0.00020128418186172073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0195_text_document +0.00019662723895606717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0196_text_document +0.0001981157042081407 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0197_text_document +0.00019703489037041608 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0198_text_document +0.00019079796331785068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0199_text_document +0.0001909352306690079 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0200_text_document +0.00018824662295261396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0201_text_document +0.00019864275319325954 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0202_text_document +0.00018818516521649587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0203_text_document +0.00018875694972812844 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0204_text_document +0.00018231621170645482 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0205_text_document +0.00018349407845798273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0206_text_document +0.00018088971427746906 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0207_text_document +0.00018296284236327237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0208_text_document +0.0001876011825819916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0209_text_document +0.000329052068725176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0210_text_document +0.00032223616273648536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0211_text_document +0.00031272564089633955 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0212_text_document +0.00031621609908414494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0213_text_document +0.0003117213560911235 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0214_text_document +0.00030218064069945934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0215_text_document +0.00030658916600512085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0216_text_document +0.0002915863534115821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0217_text_document +0.0002940280138374372 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0218_text_document +0.00029067860468866085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0219_text_document +0.00028529228063135635 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0220_text_document +0.00028336893301452256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0221_text_document +0.0002794668089130099 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0222_text_document +0.00021681361378827842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0223_text_document +0.0001484664674497246 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0224_text_document +0.00021950558378215133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0225_text_document +0.00021806860758808645 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0226_text_document +0.00021819568718852282 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0227_text_document +0.00021626925931585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0228_text_document +0.0001464536143077762 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0229_text_document +0.00021432777088808917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0230_text_document +0.000213473805865147 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0231_text_document +0.00021397067253964538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0232_text_document +0.00020758957647437263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0233_text_document +0.00020687124337683314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0234_text_document +0.00020630057046511005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0235_text_document +0.0002091166859352538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0236_text_document +0.00020777355025615267 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0237_text_document +0.00020709287641496176 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0238_text_document +0.00020736464660577094 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0239_text_document +0.00020062246741862607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0240_text_document +0.00020693207561942915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0241_text_document +0.00021151004871893024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0242_text_document +0.00019930249098689716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0243_text_document +0.00021589710041231824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0244_text_document +0.00021369204789905741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0245_text_document +0.0002147099923936778 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0246_text_document +0.00021077531190389536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0247_text_document +0.0002100509829113836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0248_text_document +0.00021185362601571124 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0249_text_document +0.00020722136637339565 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0250_text_document +0.00020300093701169531 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0251_text_document +0.00019859737993313477 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0252_text_document +0.00019971314372100164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0253_text_document +0.00019549908270269278 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0254_text_document +0.00019649820843534028 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0255_text_document +0.00019619415513498067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0256_text_document +0.00019493006120377898 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0257_text_document +0.00019499409035775506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0258_text_document +0.00019252988593634277 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0259_text_document +0.00019440768268686405 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0260_text_document +0.00018747161324755577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0261_text_document +0.0001879575932372779 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0262_text_document +0.00019040707058357506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0263_text_document +0.0001871931095090703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0264_text_document +0.00020112966223017096 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0265_text_document +0.00020516878165311017 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0266_text_document +0.00020664735191740533 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0267_text_document +0.00021041398572882962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0268_text_document +0.00020397992929690396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0269_text_document +0.0002039978580295561 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0270_text_document +0.00020592785601142126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0271_text_document +0.0001990755527445265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0272_text_document +0.00019729564847798732 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0273_text_document +0.00019958182230527032 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0274_text_document +0.0001985037302636386 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0275_text_document +0.00020204130355115716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0276_text_document +0.0002000296401958085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0277_text_document +0.0001983064832295463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0278_text_document +0.00019663108484195617 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0279_text_document +0.00019510678560556523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0280_text_document +0.0001873284057063206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0281_text_document +0.00019311553072495885 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0282_text_document +0.00034652137288816547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0283_text_document +0.0002813690318850024 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0284_text_document +0.00027697649713138685 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0285_text_document +0.0002755419092534421 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0286_text_document +0.0002681583054440219 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0287_text_document +0.00026945753192750824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0288_text_document +0.00026169470768245737 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0289_text_document +0.00026437008960810825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0290_text_document +0.0002637294838228 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0291_text_document +0.00026491867965088836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0292_text_document +0.00025504483625138986 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0293_text_document +0.0002545040623796586 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0294_text_document +0.0002546682814073622 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0295_text_document +0.00025545439487142615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0296_text_document +0.0002626896557978271 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0297_text_document +0.00025092040940402784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0298_text_document +0.0002589154885863872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0299_text_document +0.00024106160482721467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0300_text_document +0.0002483289690087987 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0301_text_document +0.0002388930282784437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0302_text_document +0.00024006340759273874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0303_text_document +0.00023765248178029045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0304_text_document +0.00023061351965578936 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0305_text_document +0.00024954224883546477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0306_text_document +0.00017861017233018525 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0307_text_document +0.00017810832743667658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0308_text_document +0.00017599709170759497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0309_text_document +0.00017462723516505223 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0310_text_document +0.0002906316527068669 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0311_text_document +0.00033762141066247166 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0312_text_document +0.00017170670574152494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0313_text_document +0.00017258674515137717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0314_text_document +0.0002815386173173926 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0315_text_document +0.0002996845935618989 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0316_text_document +0.0002735268488987296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0317_text_document +0.0002971738713071517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0318_text_document +0.0002942690674002763 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0319_text_document +0.0003322222207729567 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0320_text_document +0.0003378721656198464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0321_text_document +0.00018307262621851067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0322_text_document +0.00033956081502775057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0323_text_document +0.00031604820927876276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0324_text_document +0.00028805657681088917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0325_text_document +0.00026312293321215633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0326_text_document +0.00034366936722921455 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0327_text_document +0.0002865256504406559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0328_text_document +0.0003063615195861786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0329_text_document +0.00028412791619666136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0330_text_document +0.00028060835132727154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0331_text_document +0.00032544974761560506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0332_text_document +0.0002647177833217225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0333_text_document +0.0003152621884896575 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0334_text_document +0.0003054625140336913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0335_text_document +0.00031183308312292263 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0336_text_document +0.00018175026696621178 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0337_text_document +0.00017699918328872 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0338_text_document +0.00018222339261441908 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0339_text_document +0.00018348005930964137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0340_text_document +0.0001810735993810541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0341_text_document +0.00030846441282038914 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0342_text_document +0.0002972326889310354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0343_text_document +0.00017433421318235594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0344_text_document +0.00032799458649525895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0345_text_document +0.00032482130048512673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0346_text_document +0.00031943465668672475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0347_text_document +0.00029615593630484517 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0348_text_document +0.0002893126939511001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0349_text_document +0.0002849288351723284 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0350_text_document +0.00028383906633569267 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0351_text_document +0.00028072526091262615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0352_text_document +0.000284239564292377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0353_text_document +0.0002778903109432523 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0354_text_document +0.0002771644389501471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0355_text_document +0.0002733316182319337 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0356_text_document +0.00026362539185869363 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0357_text_document +0.0002636325383220217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0358_text_document +0.00026740622442302886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0359_text_document +0.0002646771971853427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0360_text_document +0.0002628566720605389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0361_text_document +0.0002644760695434766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0362_text_document +0.0002623837702310999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0363_text_document +0.00026088722976772894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0364_text_document +0.0002567065374799158 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0365_text_document +0.00018857382101207726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0366_text_document +0.00019036580399817203 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0367_text_document +0.00018348828065261222 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0368_text_document +0.00018491851780345073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0369_text_document +0.00018904887260080187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0370_text_document +0.0001875609304251801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0371_text_document +0.00018393034720015817 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0372_text_document +0.00018419795526114903 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0373_text_document +0.00018699955623404795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0374_text_document +0.00018276256902965128 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0375_text_document +0.00017698045695190812 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0376_text_document +0.00018104650132303642 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0377_text_document +0.00017758206731279688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0378_text_document +0.00017131402995103497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0379_text_document +0.000175944428350446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0380_text_document +0.0003416745727147391 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0381_text_document +0.0003163259373952889 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0382_text_document +0.0002804489269172448 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0383_text_document +0.00028748272397403175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0384_text_document +0.00027603318345630605 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0385_text_document +0.000271638824679648 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0386_text_document +0.0002763761210210942 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0387_text_document +0.00026501984873172717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0388_text_document +0.00026422486894694714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0389_text_document +0.0002686339100849262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0390_text_document +0.0002610837453940606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0391_text_document +0.000260974343729353 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0392_text_document +0.0002599403837029134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0393_text_document +0.0002937273113238609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0394_text_document +0.0003341790732600504 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0395_text_document +0.0002620661576600244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0396_text_document +0.0003027929169239288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0397_text_document +0.00031944039129326894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0398_text_document +0.00019025676304139009 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0399_text_document +0.00018680910145009907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0400_text_document +0.00034215840419416437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0401_text_document +0.00018618120812119364 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0402_text_document +0.00018605853095599425 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0403_text_document +0.00018120712626096538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0404_text_document +0.00018315079292495327 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0405_text_document +0.00018362556449041974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0406_text_document +0.0001780024456718171 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0407_text_document +0.00033296526436178697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0408_text_document +0.0001802398632282846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0409_text_document +0.00017340263100798256 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0410_text_document +0.00017755840547238697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0411_text_document +0.00018419413735260606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0412_text_document +0.00017869518174591322 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0413_text_document +0.00017526271460129484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0414_text_document +0.00017852168597981907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0415_text_document +0.00017566536156787157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0416_text_document +0.00017589867964432936 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0417_text_document +0.00017831487394075305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0418_text_document +0.00017837310528935862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0419_text_document +0.00018200908814216548 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0420_text_document +0.0001795136627511612 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0421_text_document +0.0003414021775300033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0422_text_document +0.00017177291787788502 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0423_text_document +0.0003441900648571877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0424_text_document +0.0003394534597060673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0425_text_document +0.0003236887233114832 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0426_text_document +0.0001639544129688747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0427_text_document +0.00019137443753211255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0428_text_document +0.00018575146284680153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0429_text_document +0.00019184792863440243 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0430_text_document +0.00018966043065679055 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0431_text_document +0.00017968851317035848 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0432_text_document +0.00018479881897661546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0433_text_document +0.0001813642692683015 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0434_text_document +0.0001686449798983066 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0435_text_document +0.00018516104592230446 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0436_text_document +0.00031283726601066385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0437_text_document +0.0003248607542883853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0438_text_document +0.00031583241601202365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0439_text_document +0.00031238270857730376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0440_text_document +0.000307150592403979 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0441_text_document +0.00029443829986847044 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0442_text_document +0.0002942723732234677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0443_text_document +0.00023514930666443422 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_en_tail-0444_text_document +0.0020776328951453444 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0000_text_document +0.0021768234410538883 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0001_text_document +0.002106973549276289 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0002_text_document +0.002110915756171751 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0003_text_document +0.0017032382109816464 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_head-0004_text_document +0.0019047944877712286 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0000_text_document +0.0019402711744016077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0001_text_document +0.0006264790011223686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_middle-0002_text_document +0.0017885401938106643 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/cc_news_tail-0000_text_document +0.0003547982093445404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0000_text_document +0.00035934014428504944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0001_text_document +0.00035707704501371544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0002_text_document +0.00035287930712815354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0003_text_document +0.00035977166728996823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0004_text_document +0.0003581675664109838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0005_text_document +0.0003548617059697185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0006_text_document +0.0003639582000286208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0007_text_document +0.00035375839698688127 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0008_text_document +0.0003743722020080678 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0009_text_document +0.0003530399715341242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0010_text_document +0.00035511875882752406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0011_text_document +0.0003618733574783154 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0012_text_document +0.00035185243285420104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0013_text_document +0.0003541503739732106 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0014_text_document +0.0003631679485751914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0015_text_document +0.00035748045578182274 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0016_text_document +0.0003606490690555877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0017_text_document +0.0003626383296610091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0018_text_document +0.00035442644361264756 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0019_text_document +0.00035978370170539796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0020_text_document +0.0003585562375341541 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0021_text_document +0.0003601958372888019 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0022_text_document +0.000350277765402227 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0023_text_document +0.0003616521184211704 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0024_text_document +0.0003620625543608188 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0025_text_document +0.0003560781983850704 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0026_text_document +0.0003553209610592676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0027_text_document +0.00035905348643915075 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0028_text_document +0.00034744258805696526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0029_text_document +0.00035462784035661496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0030_text_document +0.00034768186175100895 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0031_text_document +0.0003568534635532736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0032_text_document +0.00035586511544371234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0033_text_document +0.0003524567827568137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0034_text_document +0.0003512453770426313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0035_text_document +0.0003591792726468799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0036_text_document +0.0003514024529343127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0037_text_document +0.0003584880112586934 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0038_text_document +0.00035133552916418045 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0039_text_document +0.0003600811981350215 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0040_text_document +0.0003571663974228119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0041_text_document +0.00035768103378874214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0042_text_document +0.00035939205561113694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0043_text_document +0.00035186773916029825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0044_text_document +0.0003542829672490847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0045_text_document +0.0003592783642898726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0046_text_document +0.0003556367340099302 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0047_text_document +0.00035391392271377027 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0048_text_document +0.00035486725707484836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0049_text_document +0.00034866743396828035 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0050_text_document +0.0003517219808644735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0051_text_document +0.00034874458549673823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0052_text_document +0.000355773136961014 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0053_text_document +0.00035611750387841917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0054_text_document +0.00035305602013916315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0055_text_document +0.0003578207127071924 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0056_text_document +0.00035514635841943707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0057_text_document +0.00034816946212866206 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0058_text_document +0.0003512707269761496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0059_text_document +0.0003483392117980654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0060_text_document +0.0003572169607204321 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0061_text_document +0.00035139153281660794 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0062_text_document +0.00035536422129036537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0063_text_document +0.000352017164107143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0064_text_document +0.000351889550179365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0065_text_document +0.000358759689953589 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0066_text_document +0.0003569286079869268 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0067_text_document +0.0003657752958602099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0068_text_document +0.00035396127934790697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0069_text_document +0.0003618565071224743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0070_text_document +0.00035146051531973204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0071_text_document +0.00036107135765783567 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0072_text_document +0.00035019554279994576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0073_text_document +0.00035567858879904983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0074_text_document +0.0003504753174793183 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0075_text_document +0.00035931140831329194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0076_text_document +0.0003502967866002823 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0077_text_document +0.0003532911801041972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0078_text_document +0.0003583543013070199 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0079_text_document +0.0003566243489931224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0080_text_document +0.0003468752314799221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0081_text_document +0.0003597840618138091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0082_text_document +0.00035128822484768084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0083_text_document +0.00035889496943437507 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0084_text_document +0.000352400524650424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0085_text_document +0.0003518689536768735 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0086_text_document +0.00035866864741303467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0087_text_document +0.0003454687659106334 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0088_text_document +0.00035348007259317576 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0089_text_document +0.0003539752270940644 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0090_text_document +0.00035146495994081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0091_text_document +0.00035397212846310423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0092_text_document +0.00035208246467162587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0093_text_document +0.0003490843168676626 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0094_text_document +0.00035299633658644394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0095_text_document +0.00034868327466167065 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0096_text_document +0.00035941351365601583 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0097_text_document +0.0003545343062735255 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0098_text_document +0.0003528956380445978 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0099_text_document +0.0003553355770443352 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0100_text_document +0.0003644224004937743 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0101_text_document +0.00035234291036216907 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0102_text_document +0.0003596237469847771 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0103_text_document +0.0003531996065735989 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0104_text_document +0.0003547177054106099 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0105_text_document +0.0003575586499260483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0106_text_document +0.00035262635135283667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0107_text_document +0.0003624191962188944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0108_text_document +0.0003488398052948616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0109_text_document +0.0003598294093147917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0110_text_document +0.00035583006534466323 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0111_text_document +0.00035403139653225103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0112_text_document +0.00036134702642187156 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0113_text_document +0.0003573689927162834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0114_text_document +0.0003577141131435527 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0115_text_document +0.00035208814419277406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0116_text_document +0.00035996720683665625 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0117_text_document +0.00035415304658912596 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0118_text_document +0.00036353353029443546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0119_text_document +0.0003537326003150983 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0120_text_document +0.00036053976358299083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0121_text_document +0.000352380489373494 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0122_text_document +0.00036154661616900994 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0123_text_document +0.00035959332325963614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0124_text_document +0.0003597954667189692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0125_text_document +0.0003563108270597542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0126_text_document +0.0003582891940460143 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0127_text_document +0.0003497728210484297 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0128_text_document +0.0003549834902179354 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0129_text_document +0.0003529828233484542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0130_text_document +0.00034627483903285777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0131_text_document +0.00035569006572589215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0132_text_document +0.00035449377946910314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0133_text_document +0.00035802844396194623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0134_text_document +0.0003617277809353208 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0135_text_document +0.00035034118898654814 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0136_text_document +0.000351091193908611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0137_text_document +0.0003527914342210668 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0138_text_document +0.00035028288369781376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0139_text_document +0.00035775745592780506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0140_text_document +0.0003449630690661468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0141_text_document +0.0003583490698830361 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0142_text_document +0.0003476995746684122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0143_text_document +0.0003535632505019212 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0144_text_document +0.00035640180641147417 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0145_text_document +0.000361731045691765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0146_text_document +0.0003534082129597368 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0147_text_document +0.0003550344149828664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0148_text_document +0.00035363002411364057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0149_text_document +0.0003537265579677396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0150_text_document +0.00034950531383577937 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0151_text_document +0.00035008511827347514 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0152_text_document +0.00035594533400871325 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0153_text_document +0.00035266312861335946 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0154_text_document +0.00035280268794863923 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0155_text_document +0.0003565470391528536 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0156_text_document +0.0003588492322689137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0157_text_document +0.00035469909697832775 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0158_text_document +0.00034712082813410526 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0159_text_document +0.000348701157101807 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0160_text_document +0.0003500192014479944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0161_text_document +0.00035120560544669755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0162_text_document +0.00035403656850437445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0163_text_document +0.00035852376560749366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0164_text_document +0.0003534754068111774 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0165_text_document +0.00035591740046720765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0166_text_document +0.000348522354782563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0167_text_document +0.0003533533959664415 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0168_text_document +0.00035631425964030697 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0169_text_document +0.0003485886551574741 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0170_text_document +0.00035917652631065777 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0171_text_document +0.0003482975272111288 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0172_text_document +0.00035580661277480167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0173_text_document +0.0003492290722955348 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0174_text_document +0.00034989284450240613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0175_text_document +0.0003545677216162781 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0176_text_document +0.00034622286859463484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0177_text_document +0.00036070626989861965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0178_text_document +0.00035518365036320786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0179_text_document +0.00035272907057848406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0180_text_document +0.0003547343638218734 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0181_text_document +0.0003496450144966242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0182_text_document +0.0003537407829294287 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0183_text_document +0.0003489722653985685 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0184_text_document +0.00035057186899911295 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0185_text_document +0.0003507566548933051 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0186_text_document +0.00035630360179023747 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0187_text_document +0.00035631362503416367 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0188_text_document +0.0003490204248026821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0189_text_document +0.00035761724058371226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0190_text_document +0.00035037664777467137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0191_text_document +0.000353402110481068 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0192_text_document +0.00034524163568371745 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0193_text_document +0.00035528523728570974 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0194_text_document +0.00034784916132431703 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0195_text_document +0.00034928476408048925 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0196_text_document +0.00034989205973784984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0197_text_document +0.00034201664404094254 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0198_text_document +0.0003529676016338611 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0199_text_document +0.00034643433682346637 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0200_text_document +0.0003511666373001904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0201_text_document +0.00034828669066575333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0202_text_document +0.0003494625207264413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0203_text_document +0.0003458957535879216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0204_text_document +0.0003543020478990003 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0205_text_document +0.00034754384069014956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0206_text_document +0.0003598856392240133 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0207_text_document +0.0003503335458553846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0208_text_document +0.00035919595619778716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0209_text_document +0.00035767737970754404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0210_text_document +0.00035197152783998165 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0211_text_document +0.0003549609834422404 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0212_text_document +0.0003568184100569753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0213_text_document +0.0003512652818651935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0214_text_document +0.00035912648958665754 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0215_text_document +0.00034764526964056546 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0216_text_document +0.000352439784960359 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0217_text_document +0.00035295886560764226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0218_text_document +0.0003518132693658672 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0219_text_document +0.00035589987915465713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0220_text_document +0.00034923863317385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0221_text_document +0.0003457987267929692 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0222_text_document +0.0003560928663480501 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0223_text_document +0.0003529603811204932 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0224_text_document +0.0003524438555443043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0225_text_document +0.0003438847030263783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0226_text_document +0.00035981978898461613 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0227_text_document +0.0003446342778566972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0228_text_document +0.00035529584995236537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0229_text_document +0.00034855740895831116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0230_text_document +0.00034932634912802544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0231_text_document +0.00035805518303064666 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0232_text_document +0.0003497941877073061 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0233_text_document +0.00035774398685405447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0234_text_document +0.0003560421780316607 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0235_text_document +0.0003508844468369392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0236_text_document +0.00035731928892270107 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0237_text_document +0.0003557884626314314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0238_text_document +0.00034992996760289355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0239_text_document +0.000360752554360921 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0240_text_document +0.0003452321668708545 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0241_text_document +0.0003591745226131023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0242_text_document +0.00035256981433229084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0243_text_document +0.00035378123159712034 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0244_text_document +0.000350464354895999 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0245_text_document +0.00035074625557389677 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0246_text_document +0.00035025894701994667 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0247_text_document +0.00035437902514857614 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0248_text_document +0.0003514684519732232 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0249_text_document +0.00035449717909633905 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0250_text_document +0.0003436816402714221 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0251_text_document +0.00035139158071782116 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0252_text_document +0.0003509424079843335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0253_text_document +0.000343894618577506 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0254_text_document +0.0003500789770661659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0255_text_document +0.0003407788080680086 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0256_text_document +0.0003581908175239701 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0257_text_document +0.0003465541618780918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0258_text_document +0.00034600228792437736 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0259_text_document +0.00034416738982773204 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0260_text_document +0.0003519900340150641 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0261_text_document +0.000343369616864659 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0262_text_document +0.0003544993883274688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0263_text_document +0.0003504441365073392 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0264_text_document +0.00034859160702727056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0265_text_document +0.00035355909532647185 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0266_text_document +0.0003471900922691849 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0267_text_document +0.0003563015508709187 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0268_text_document +0.0003487888744148821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0269_text_document +0.00034711767548688336 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0270_text_document +0.0003530734609369085 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0271_text_document +0.00035123969242560935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0272_text_document +0.0003517127620891489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0273_text_document +0.00035232835416868673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0274_text_document +0.0003524437481912308 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0275_text_document +0.0003525996167005602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0276_text_document +0.00035064770545242043 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0277_text_document +0.00035311558274981226 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0278_text_document +0.00034952204800569914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0279_text_document +0.0003541471367344846 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0280_text_document +0.00035418812454561825 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0281_text_document +0.0003528951372900714 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0282_text_document +0.0003542338042975688 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0283_text_document +0.00034937738939942796 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0284_text_document +0.0003522182190878447 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0285_text_document +0.0003501406466507449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0286_text_document +0.00034973079877492633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0287_text_document +0.0003485274567713538 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0288_text_document +0.00034999308679368985 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0289_text_document +0.0003570051724707296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0290_text_document +0.00034567230462019706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0291_text_document +0.00035529000940160696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0292_text_document +0.00034956512308671755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0293_text_document +0.0003496962834028953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0294_text_document +0.0003468745282493457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0295_text_document +0.0003502717155809202 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0296_text_document +0.0003556240880896514 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0297_text_document +0.0003515109488424343 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0298_text_document +0.0003563156688192592 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0299_text_document +0.00035040277363989817 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0300_text_document +0.0003481408593290717 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0301_text_document +0.0003624575124332874 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0302_text_document +0.0003522684124250313 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0303_text_document +0.00035286996027653544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0304_text_document +0.00034967623997256725 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0305_text_document +0.00035182649587602765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0306_text_document +0.0003524892557026489 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0307_text_document +0.0003507642477451811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0308_text_document +0.00036190408389835666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0309_text_document +0.00035102739424880766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0310_text_document +0.00035239718753257265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0311_text_document +0.00035298076121821316 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0312_text_document +0.0003478704389752654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0313_text_document +0.0003503109191567942 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0314_text_document +0.00035143250975654426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0315_text_document +0.0003480663923069012 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0316_text_document +0.00035691540219998623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0317_text_document +0.000348815437166351 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0318_text_document +0.00035202073257766225 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0319_text_document +0.0003491569096274706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0320_text_document +0.00035277390475511834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0321_text_document +0.0003524972090026609 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0322_text_document +0.0003504854249750236 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0323_text_document +0.00034740238025423914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0324_text_document +0.00034968015462277606 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0325_text_document +0.0003493798632762674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0326_text_document +0.0003488202537862122 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0327_text_document +0.0003525461864643725 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0328_text_document +0.00034903815232825664 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0329_text_document +0.00035536982539258216 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0330_text_document +0.00034858083265155483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0331_text_document +0.0003505014973608067 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0332_text_document +0.00035327984042622104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0333_text_document +0.0003503286677453136 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0334_text_document +0.00035835274842442816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0335_text_document +0.00034970302660275595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0336_text_document +0.000357929573140149 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0337_text_document +0.0003517238649788585 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0338_text_document +0.00036097027318848475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0339_text_document +0.0003502734074110026 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0340_text_document +0.00035801510806036273 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0341_text_document +0.0003568006373479869 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0342_text_document +0.00036128108717454636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0343_text_document +0.0003563436883111686 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0344_text_document +0.00035559725321852463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0345_text_document +0.00035089656006854944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0346_text_document +0.000359453964362057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0347_text_document +0.00035629498059104033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0348_text_document +0.0003622207707090437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0349_text_document +0.0003540946784512821 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0350_text_document +0.0003594750565232011 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0351_text_document +0.0003566007415086991 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0352_text_document +0.0003562142599126134 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0353_text_document +0.0003569948186744601 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0354_text_document +0.00035166554847920186 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0355_text_document +0.00035047994419295137 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0356_text_document +0.0003561578193739437 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0357_text_document +0.00035470866838811544 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0358_text_document +0.00034216920464876335 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0359_text_document +0.0003550021513075795 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0360_text_document +0.0003488045105938729 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0361_text_document +0.0003513340720840151 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0362_text_document +0.0003448558566387584 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0363_text_document +0.0003460966026953241 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0364_text_document +0.0003488157616036459 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0365_text_document +0.0003446120387842362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0366_text_document +0.000351528602987427 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0367_text_document +0.00035661118227454713 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0368_text_document +0.0003551342699877457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0369_text_document +0.0003478953397924445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0370_text_document +0.00034625782458988215 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0371_text_document +0.0003527515447405871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0372_text_document +0.00034823744889805696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0373_text_document +0.00034823314560254406 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0374_text_document +0.00035162668292961944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0375_text_document +0.0003477307716074623 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0376_text_document +0.0003446457989477787 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0377_text_document +0.00034782916273767795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0378_text_document +0.0003517249130302248 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0379_text_document +0.0003449873430908556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0380_text_document +0.00034841291749669877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0381_text_document +0.0003466028498941749 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0382_text_document +0.0003486436831199424 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0383_text_document +0.0003478279234211838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0384_text_document +0.0003495903653274374 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0385_text_document +0.00034896893881218957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0386_text_document +0.000348941645312426 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0387_text_document +0.0003474221308416894 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0388_text_document +0.0003462621543839385 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0389_text_document +0.0003669373860863891 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0390_text_document +0.00034691156268163006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0391_text_document +0.0003527774103765281 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0392_text_document +0.00034684565672734663 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0393_text_document +0.0003454250599604457 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0394_text_document +0.0003541536557159006 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0395_text_document +0.000345735737037366 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0396_text_document +0.0003524669816385214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0397_text_document +0.0003441817133096468 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0398_text_document +0.0003519093265859089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0399_text_document +0.00035080085480352095 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0400_text_document +0.00035285227929327434 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0401_text_document +0.00034354836346901676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0402_text_document +0.00034789770937373467 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0403_text_document +0.000343665920520102 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0404_text_document +0.0003490884931060568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0405_text_document +0.00034380029463398654 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0406_text_document +0.00034874768005099945 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0407_text_document +0.0003457058510967673 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0408_text_document +0.00034644265227023904 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0409_text_document +0.00035008339858594957 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0410_text_document +0.0003462377193296194 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0411_text_document +0.0003620491787114201 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0412_text_document +0.000348717011044469 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0413_text_document +0.00034370072363913706 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0414_text_document +0.0003551981066775649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0415_text_document +0.0003500119496799342 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0416_text_document +0.0003485082952669081 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0417_text_document +0.0003508155580978919 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0418_text_document +0.00035311375163251416 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0419_text_document +0.00034945972003423253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0420_text_document +0.0003474220353789879 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0421_text_document +0.0003536443686585001 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0422_text_document +0.0003560350489042953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0423_text_document +0.0003493655927914396 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0424_text_document +0.0003528423977146383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0425_text_document +0.00035255554724471217 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0426_text_document +0.0003479760010190111 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0427_text_document +0.00035458598862501956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0428_text_document +0.0003458990560538315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0429_text_document +0.00035157946422379875 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0430_text_document +0.00034736860650169996 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0431_text_document +0.0003529152313394119 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0432_text_document +0.00034586294329524465 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0433_text_document +0.00035707214923794877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0434_text_document +0.0003509580363496512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0435_text_document +0.00035244176725524474 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0436_text_document +0.0003467539557999047 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0437_text_document +0.00034919687962275546 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0438_text_document +0.00035094031731719953 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0439_text_document +0.0003484309008351352 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0440_text_document +0.0003485409424916253 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0441_text_document +0.0003499590776117838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0442_text_document +0.0003492842758957848 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0443_text_document +0.0003529712275178912 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0444_text_document +0.0003566141287087449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0445_text_document +0.0003649496522047409 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0446_text_document +0.0003563218912208234 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0447_text_document +0.00035614782126966145 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0448_text_document +0.0003531944298453266 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0449_text_document +0.0003535950949566616 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0450_text_document +0.0003544295554928795 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0451_text_document +0.0003519908503740376 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0452_text_document +0.00035752817626134463 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0453_text_document +0.0003515322689589972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0454_text_document +0.0003486893890307115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0455_text_document +0.0003446520464889867 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0456_text_document +0.0003509421562481707 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0457_text_document +0.00035335015702909084 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0458_text_document +0.0003490178167345008 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0459_text_document +0.0003520497821155174 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0460_text_document +0.0003549762618908944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0461_text_document +0.00035072190850833103 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0462_text_document +0.0003542458638526423 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0463_text_document +0.000352419194572916 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0464_text_document +0.0003545102564672614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0465_text_document +0.0003495437992331806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0466_text_document +0.0003542843376993964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0467_text_document +0.000352827529313958 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0468_text_document +0.00035442506093223886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0469_text_document +0.0003496970719044257 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0470_text_document +0.0003553096424442362 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0471_text_document +0.00034986845565067564 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0472_text_document +0.000352131055186658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0473_text_document +0.0003527021708198983 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0474_text_document +0.00034905885414547214 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0475_text_document +0.0003583433842468394 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0476_text_document +0.00034409435202828383 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0477_text_document +0.00034846410520871483 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0478_text_document +0.0003554459991927314 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0479_text_document +0.00035310507471843076 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0480_text_document +0.000350028910786098 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0481_text_document +0.00035049727458009896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0482_text_document +0.0003519047735925826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0483_text_document +0.0003513027429919726 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0484_text_document +0.0003626947260354396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0485_text_document +0.0003500087324849783 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0486_text_document +0.0003618315726725285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0487_text_document +0.0003535385113938023 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0488_text_document +0.0003487064058517615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0489_text_document +0.0003618709124780938 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0490_text_document +0.00035040070335625915 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0491_text_document +0.0003506279032267829 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0492_text_document +0.0003498435310527524 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0493_text_document +0.0003554634749821431 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0494_text_document +0.00035091209738758963 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0495_text_document +0.00035034103678978573 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0496_text_document +0.00035398931854386146 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0497_text_document +0.00035495529304989485 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0498_text_document +0.00036067883473356603 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/falcon-0499_text_document +6.322825248625475e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0000_text_document +2.4432314037946264e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0001_text_document +5.6313888721313454e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0002_text_document +2.4208171781595055e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0003_text_document 
+2.325811856369237e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0004_text_document +2.4010790356322705e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0005_text_document +5.36773610843632e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0006_text_document +1.360574433501002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0007_text_document +1.3076540344853244e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0008_text_document +1.3386534334886313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0009_text_document +1.2498103719605153e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0010_text_document +1.403763836949682e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0011_text_document +1.3636756723495417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0012_text_document +1.2242489446940814e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0013_text_document +1.2398255818973339e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0014_text_document +1.2972616994216281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0015_text_document +1.3947809855914134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0016_text_document +1.3144843787829514e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0017_text_document +1.1693809976572487e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0018_text_document +1.3677252682893802e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0019_text_document +1.3940876719849597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0020_text_document +1.4222245138730965e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0021_text_document +1.3201677767919704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0022_text_document +1.1421717796486169e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0023_text_document +1.2890514724498703e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0024_text_document +1.3649507648749037e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0025_text_document +1.2400732563490717e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0026_text_document +1.1557681453277616e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0027_text_document +1.2294483595964517e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0028_text_document +1.2137484472122283e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0029_text_document +1.3299663426456e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0030_text_document +1.2461984216479532e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0031_text_document +1.4666434217609636e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0032_text_document +1.1876997894686238e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0033_text_document +1.2939155338964078e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0034_text_document +1.3859590039728515e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0035_text_document +1.317917848615668e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0036_text_document +1.1335281536110342e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0037_text_document +1.2889923952861426e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0038_text_document +1.3471671647053326e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0039_text_document +1.2221720014475102e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0040_text_document +1.2632647276287541e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0041_text_document +1.28276219004076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0042_text_document +1.36213704321643e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0043_text_document +1.2414858625261553e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0044_text_document +1.3173700421883744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0045_text_document +1.295597796725686e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0046_text_document +1.242783936442904e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0047_text_document +1.2417374088427464e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0048_text_document +1.2134479405400744e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0049_text_document +1.3090040663304255e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0050_text_document +1.2713470581614905e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0051_text_document +5.5750231378906594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0052_text_document +5.777597358425469e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0053_text_document +5.349786767471258e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0054_text_document +5.675165050453583e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0055_text_document +5.482611216158831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0056_text_document +5.065421899890121e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0057_text_document +5.384718357480146e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0058_text_document +4.872037363236061e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0059_text_document +4.532709250783155e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0060_text_document +5.7257963030489613e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0061_text_document +4.9014365579652036e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0062_text_document +5.722863552770969e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0063_text_document +6.149911636146833e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0064_text_document 
+5.2178057608273506e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0065_text_document +4.990228161160431e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0066_text_document +5.866186875255134e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0067_text_document +5.004185734360719e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0068_text_document +4.79401853705107e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0069_text_document +5.435219965052376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0070_text_document +5.035997225792266e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0071_text_document +5.622401774211625e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0072_text_document +5.028826157387559e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0073_text_document +5.596379470128795e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0074_text_document +6.027824493191489e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0075_text_document +5.5358270009931474e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0076_text_document +5.9839051807685496e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0077_text_document +5.1221077499249595e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0078_text_document +5.517228560620279e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0079_text_document +5.1687858285052305e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0080_text_document +5.684188244145645e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0081_text_document +5.212693275535878e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0082_text_document +4.8551007022784084e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0083_text_document +5.4888506639203145e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0084_text_document +5.345098688527242e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0085_text_document +4.8506420625516594e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0086_text_document +5.132168603397676e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0087_text_document +5.719476795114223e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0088_text_document +5.7448621149792696e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0089_text_document +4.9068410568059265e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0090_text_document +5.382937299647678e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0091_text_document +4.8288432136304634e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0092_text_document +5.841703200305416e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0093_text_document +5.1589611587885584e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0094_text_document +6.031113829732574e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0095_text_document 
+5.4558202844532094e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0096_text_document +5.341852317196142e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0097_text_document +5.1402942738369954e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0098_text_document +5.735421384377395e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0099_text_document +5.473629863586958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0100_text_document +5.4708993245733936e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0101_text_document +4.931161863634078e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0102_text_document +5.104173022127248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0103_text_document +5.510157161510824e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0104_text_document +5.652501401782597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0105_text_document +5.7273656573031666e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0106_text_document +5.638363224821738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0107_text_document +5.6128115396668704e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0108_text_document +5.00304877998141e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0109_text_document +5.596120554779096e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0110_text_document +5.5280923889040006e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0111_text_document +5.223477917938408e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0112_text_document +5.29472809986569e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0113_text_document +2.205682378243213e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0114_text_document +1.4367563720603185e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0115_text_document +3.5506193487931076e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0116_text_document +3.0442910855821778e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0117_text_document +2.2540042508019627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0118_text_document +2.6880163202623216e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0119_text_document +2.534473148048727e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0120_text_document +2.6560945431318916e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0121_text_document +2.547470248967691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0122_text_document +2.5248825388073738e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0123_text_document +2.5828729575000054e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0124_text_document +2.4026583817957736e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0125_text_document +2.3930425429834413e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0126_text_document +2.5037365362599724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0127_text_document +2.6696745470595603e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0128_text_document +2.140323051341762e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0129_text_document +2.617354786691592e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0130_text_document +1.538359101762691e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0131_text_document +1.2871029252377856e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0132_text_document +2.255195411289217e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0133_text_document +2.4832313897952067e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0134_text_document +9.303873918189968e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0135_text_document +2.179532302620228e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0136_text_document +1.9750517506901206e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0137_text_document +2.7740420380648435e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0138_text_document +2.7813714782319335e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0139_text_document +4.1595357937609806e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0140_text_document +2.741365122389175e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0141_text_document +2.117451071361901e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0142_text_document +1.7132649760565998e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0143_text_document +1.7492547092602047e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0144_text_document +1.7499951097392276e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0145_text_document +1.6632444789170958e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0146_text_document +1.6678802252361607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0147_text_document +1.5519208704558896e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0148_text_document +1.652420992967167e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0149_text_document +1.6119931034508755e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0150_text_document +1.6638882076736552e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0151_text_document +1.7198076782652946e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0152_text_document +1.572927860565175e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0153_text_document +1.5194822618169918e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0154_text_document +1.6677776832669846e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0155_text_document +1.595612492245688e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0156_text_document +1.682350633181197e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0157_text_document +1.663983380609724e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0158_text_document +1.710187842689243e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0159_text_document +1.5733697527539038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0160_text_document +1.6972104757911438e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0161_text_document +1.6610142847616577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0162_text_document +1.61094882403031e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0163_text_document +1.4789207305138325e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0164_text_document +1.639299617676302e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0165_text_document +1.3241204512116132e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0166_text_document +8.582260726625535e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0167_text_document +8.213000975576739e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0168_text_document +9.549247732811947e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0169_text_document +9.17242785339013e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0170_text_document +7.632868223725218e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0171_text_document 
+8.674401118222175e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0172_text_document +9.124384255505347e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0173_text_document +8.344222222417358e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0174_text_document +8.992299957499065e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0175_text_document +8.76689497361025e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0176_text_document +7.973396239586015e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0177_text_document +9.006935606644125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0178_text_document +8.725545954955498e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0179_text_document +1.215449694669174e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0180_text_document +3.3041720284158646e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0181_text_document +2.0593512412624502e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0182_text_document +1.893608946986248e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0183_text_document +1.737111666788535e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0184_text_document +1.4915923449873955e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0185_text_document +2.289370239067605e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0186_text_document +2.8615335689614638e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0187_text_document +8.847283630883125e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0188_text_document +1.8175470362373804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0189_text_document +1.8152226683368038e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0190_text_document +1.789149655314284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0191_text_document +1.7690523036477663e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0192_text_document +1.8333732213753644e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0193_text_document +1.8794105687718654e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0194_text_document +1.721841156706417e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0195_text_document +2.0612008685724796e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0196_text_document +1.9297370681336376e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0197_text_document +2.0188440409661018e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0198_text_document +5.1741216329695265e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0199_text_document +1.3417913926038429e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0200_text_document +1.1010813016469651e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0201_text_document +1.1252416134320087e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0202_text_document +1.2801744104313002e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0203_text_document +1.3041514955795817e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0204_text_document +1.3428837580879075e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0205_text_document +1.320809382267804e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0206_text_document +1.3451566676555968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0207_text_document +1.228284926657501e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0208_text_document +1.2410599573923043e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0209_text_document +1.3815343367377182e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0210_text_document +1.3895126265148832e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0211_text_document +1.2306773644401741e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0212_text_document +1.32981021906281e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0213_text_document +1.101337469221607e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0214_text_document +1.513094184404692e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0215_text_document +1.1073759547073234e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0216_text_document +1.2879348765857567e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0217_text_document +9.619595770228435e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0218_text_document +1.2384340836286436e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0219_text_document +1.1766667232211577e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0220_text_document +1.2871049236196452e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0221_text_document +1.2010645926497744e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0222_text_document +1.3971428231518597e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0223_text_document +1.2283733550547932e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0224_text_document +1.2659530508255308e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0225_text_document +1.551775613074462e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0226_text_document +1.1169413343776979e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0227_text_document +1.1433700593712463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0228_text_document +4.964773647323492e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0229_text_document +1.0995586595687313e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0230_text_document +1.2957393071411267e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0231_text_document +2.75899247407709e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0232_text_document +2.8269344597344854e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0233_text_document +2.329108187246831e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0234_text_document +2.4231761430460284e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0235_text_document +1.2434140512230442e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0236_text_document +1.638718338352859e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0237_text_document +3.272953556801187e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0238_text_document +6.061314500486327e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0239_text_document +1.2465979731210292e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0240_text_document +1.2737557327967737e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0241_text_document +1.038428658075627e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0242_text_document +2.61666472045566e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0243_text_document +3.6506873212272224e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0244_text_document +1.5066359138295701e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0245_text_document +1.1166290872121178e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0246_text_document +1.5546966228590285e-05 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0247_text_document +1.2583434625014828e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0248_text_document +1.3398826881300862e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0249_text_document +1.2944933160515968e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0250_text_document +1.0971437399901365e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0251_text_document +1.2787922795775774e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0252_text_document +1.404979227816985e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0253_text_document +1.3344734431324463e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0254_text_document +4.886031157107555e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0255_text_document +3.277261443596394e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0256_text_document +3.5057957685786495e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0257_text_document +3.287625301718589e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0258_text_document +3.1370056372668855e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0259_text_document +3.186092015785841e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0260_text_document +7.271819324142512e-06 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/megawika-0261_text_document +0.001451215788905126 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0000_text_document +0.0014486847196258788 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0001_text_document +0.0008861032722895899 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0002_text_document +0.0018119590809459816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0003_text_document +0.0008916937917547129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0004_text_document +6.960128832809415e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0005_text_document +0.002008403651063623 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0006_text_document +0.0014374900742131454 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0007_text_document +0.00180213596996716 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0008_text_document +0.001956178877532413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0009_text_document +0.0008829547017667033 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0010_text_document +0.0008910853619157279 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0011_text_document +0.0018260998845299973 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/open-web-math-train-0012_text_document +0.0012499632072059553 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0000_text_document +0.00125398260359913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0001_text_document 
+0.0012541704774729071 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0002_text_document +0.0012527268234360602 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0003_text_document +0.0012532925243737164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0004_text_document +0.0012456396241204315 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0005_text_document +0.0012589894424352072 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0006_text_document +0.001508020123999618 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0007_text_document +0.00333096950781965 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0008_text_document +0.0033233414614415547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0009_text_document +0.003512387990689828 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0010_text_document +0.0035091382940513126 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0011_text_document +0.003514155927147005 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0012_text_document +0.003327108000579638 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0013_text_document +0.003329106196589836 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0014_text_document +0.003505604148738077 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0015_text_document +0.003324825759567855 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0016_text_document +0.0033248240149804913 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0017_text_document +0.0033385962112851358 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0018_text_document +0.0035043186296553615 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0019_text_document +0.003340469505431529 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0020_text_document +0.0035106889084796276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0021_text_document +0.0033309469281030167 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0022_text_document +0.003340337858029757 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0023_text_document +0.003505919861097801 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0024_text_document +0.0003882924098240512 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/pes2o-0025_text_document +0.0005759963691850877 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0000_text_document +0.0005959971675332674 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0001_text_document +0.0006026179290353799 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0002_text_document +0.0005824184320784846 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0003_text_document +0.0005854598548616037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0004_text_document +0.0005903767055633473 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0005_text_document +0.0005930306490982049 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0006_text_document +0.000569425602700746 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0007_text_document +0.0005675060415179408 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0008_text_document +0.0005772431621253389 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0009_text_document +0.0005678026053826858 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0010_text_document +0.0005700398263483378 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0011_text_document +0.0005669467963528824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0012_text_document +0.0005701015953324305 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0013_text_document +0.0005795907287413296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0014_text_document +0.0005735602737531164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0015_text_document +0.0005749862745842101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0016_text_document +0.0005693257015931971 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0017_text_document +0.0005716568794795563 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0018_text_document +0.0005761083919774021 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0019_text_document +0.0005688343169797355 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0020_text_document +0.0005807913190929842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0021_text_document +0.0005710229258078636 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0022_text_document +0.0005704083039826862 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0023_text_document +0.0005862132348308056 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0024_text_document +0.0005717662049559556 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0025_text_document +0.0005858155213694451 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0026_text_document +0.0005812012281792392 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0027_text_document +0.0005803981414588498 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0028_text_document +0.0005700102108287723 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0029_text_document +0.0005719243459052329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0030_text_document +0.0005867253401661752 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0031_text_document +0.0005731087218860733 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0032_text_document +0.0005712197789109317 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0033_text_document +0.0005702376926310089 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0034_text_document +0.0005700411527742972 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0035_text_document +0.0005828090098178196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0036_text_document +0.0005770140826168056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0037_text_document +0.0005723509664597896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0038_text_document +0.0005755499231836962 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0039_text_document +0.0005636407438471367 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0040_text_document +0.0005640281556500104 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0041_text_document +0.0005633159058766496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0042_text_document +0.0005638034311151449 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0043_text_document +0.0005630066273073224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0044_text_document +0.0005631803831128559 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0045_text_document +0.0005631228881679657 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0046_text_document +0.0005628178701487633 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0047_text_document +0.0005624448092256196 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0048_text_document +0.0005620957024062329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0049_text_document +0.0005614201504177484 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0050_text_document +0.0005616890951464056 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0051_text_document +0.0005611348559279058 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0052_text_document +0.0005604238061828518 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0053_text_document +0.0005603301490194237 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0054_text_document +0.0005607291294548833 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0055_text_document +0.0005605234569930727 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0056_text_document +0.0005613778566640694 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0057_text_document +0.0005610248539992471 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0058_text_document +0.0005599977416780475 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0059_text_document +0.0005603632562116935 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0060_text_document +0.0005599177479509897 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0061_text_document +0.0005595202318298379 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0062_text_document +0.0005600975633499175 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0063_text_document +0.0005614075491213365 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0064_text_document +0.000612563885043477 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0065_text_document +0.0005515469909644413 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0066_text_document +0.0005526782014946906 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0067_text_document +0.0005472463408095445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0068_text_document +0.0005502284746004587 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0069_text_document +0.0005414514790555363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0070_text_document +0.0005513499500134784 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0071_text_document +0.0005391391454105187 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0072_text_document +0.0005415836910001838 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0073_text_document +0.0005208132468536551 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0074_text_document +0.0005889827143132871 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0075_text_document +0.0005822520817765276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0076_text_document +0.0004173155230758696 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/reddit-0077_text_document +0.0009994361338078242 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0000_text_document +0.001087156194657966 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0001_text_document +0.0010667737163656816 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0002_text_document +0.0009602877882124873 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0003_text_document +0.0008968956271971105 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0004_text_document +0.0009198034843762967 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0005_text_document +0.0009423901016715341 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0006_text_document +0.0009674094553686345 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0007_text_document +0.0009858331322519164 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0008_text_document +0.0009970593645879198 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0009_text_document +0.0010027035193731686 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0010_text_document +0.0010128291154221853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0011_text_document +0.0010215631382631918 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0012_text_document +0.0010288663771461238 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0013_text_document +0.0010346219929285867 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0014_text_document +0.00104544019940344 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0015_text_document +0.0010525172676724333 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0016_text_document +0.0010609529620775127 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0017_text_document +0.0010725892748610153 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0018_text_document +0.0010818563598181568 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0019_text_document +0.0010992760196793917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0020_text_document +0.0011178992762079917 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0021_text_document +0.001124687532085676 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0022_text_document +0.001118303661267191 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0023_text_document +0.0010206825575416534 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0024_text_document +0.0005512280117499715 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/stackexchange-0025_text_document +0.004474659408857016 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0000_text_document +0.00409944473890653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0001_text_document +0.005137179939941845 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0002_text_document +0.005143172251066109 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0003_text_document +0.005206134363352808 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0004_text_document +0.004892747858974329 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0005_text_document +0.004844731352552902 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0006_text_document +0.005308320169123755 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0007_text_document +0.005124709815666577 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0008_text_document +0.005424710744483826 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0009_text_document +0.00538244648861977 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0010_text_document +0.0029107284679086853 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0011_text_document +0.0026825258998444705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0012_text_document +0.0026904503191419243 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0013_text_document +0.002687906577174073 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0014_text_document +0.002850165346048818 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0015_text_document +0.005322698571717847 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0016_text_document +0.004450334290869719 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0017_text_document +0.004700990083440683 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0018_text_document +0.003903568556500995 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0019_text_document +0.00390561515396931 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0020_text_document +0.0039046402900912262 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0021_text_document +0.003907454839379547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0022_text_document +0.0038583224578603824 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0023_text_document +0.0037914116657695 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0024_text_document +0.003786665266798682 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0025_text_document +0.003792000802430658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0026_text_document +0.00319266847466091 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0027_text_document +0.0032658716699838944 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0028_text_document 
+0.0034801959532460023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0029_text_document +0.0028307012092022594 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0030_text_document +0.0028420360878146276 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0031_text_document +0.0028410455248484914 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0032_text_document +0.00283497183526842 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0033_text_document +0.002840187195459487 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0034_text_document +0.0028398709431369834 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0035_text_document +0.004364722843422023 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0036_text_document +0.004093255713117101 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0037_text_document +0.004092331079566252 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0038_text_document +0.004005326985579649 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0039_text_document +0.0036205502856964207 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0040_text_document +0.003625316793034984 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0041_text_document +0.003604743435602363 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0042_text_document +0.0035405823343673125 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0043_text_document +0.0041601413517253945 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0044_text_document +0.005886303658937057 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0045_text_document +0.003600909532810332 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0046_text_document +0.0034941365817168658 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0047_text_document +0.0004992164842980224 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/starcoder-0048_text_document +0.00032927705604725614 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0000_text_document +0.0002860154190878753 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0001_text_document +0.0002845217585425619 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0002_text_document +0.0002743528685497456 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0003_text_document +0.00026025323737738766 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0004_text_document +0.00023493876414603155 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0005_text_document +0.00029665994994226705 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0006_text_document +0.00031808102075993956 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0007_text_document +0.00031813573046011285 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0008_text_document +0.0002711905171855542 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0009_text_document +0.00028892513401817095 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0010_text_document +0.00030003908676979083 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0011_text_document +0.00026839878771944684 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0012_text_document +0.00029155935002690497 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0013_text_document +0.0002998624927624209 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0014_text_document +0.0003091705447974841 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0015_text_document +0.00026873195794309786 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0016_text_document +0.00027721873498527547 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0017_text_document +0.0002841662554024377 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0018_text_document +0.0002839461156551537 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0019_text_document +0.0002861705604659811 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0020_text_document +0.0002460995649635886 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0021_text_document +0.00019420142619795496 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0022_text_document +0.00021967677816173628 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0023_text_document +0.0002620283200480949 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0024_text_document +0.0002433390542188936 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0025_text_document +0.00021254976608350767 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0026_text_document +0.00022094815569522115 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0027_text_document +0.000342862378668244 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0028_text_document +0.00033784225259118157 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0029_text_document +0.0003367278459543952 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0030_text_document +0.00029843279042852765 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0031_text_document +0.0002926583661257988 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0032_text_document +0.00029320337282010673 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0033_text_document +0.00029281450669483455 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0034_text_document +0.0002915338187002653 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0035_text_document +0.0002864226923084572 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0036_text_document +0.00028643439083586396 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0037_text_document +0.00028253710956299054 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0038_text_document +0.0002810856078805806 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0039_text_document +0.00031474941344656715 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0040_text_document +0.0002139130222205655 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0041_text_document +0.0003084648871862831 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0042_text_document +0.0003309477872140129 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0043_text_document +0.0003360096824695161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0044_text_document +0.0003355452655196557 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0045_text_document +0.00038119390366386037 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0046_text_document +0.00038078927630086064 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0047_text_document +0.0003386200917551554 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0048_text_document +0.0002158905159938882 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0049_text_document +0.00021621682877018768 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0050_text_document +0.00021553306942740535 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0051_text_document +0.00021581563462722296 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0052_text_document +0.0002157694110556169 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0053_text_document +0.000215643699847159 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0054_text_document +0.00021532716715168094 
/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0055_text_document +0.00021531221326022472 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0056_text_document +0.0002831801179028896 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0057_text_document +0.0002514844936507595 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0058_text_document +0.00031638782778107964 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0059_text_document +0.0002749197545278445 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0060_text_document +0.00026159721512464495 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0061_text_document +0.0002630052420096968 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0062_text_document +0.00031106811228913666 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0063_text_document +0.0002852973415334161 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0064_text_document +3.7555372465932136e-05 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/tulu_flan-0065_text_document +0.003548077173506675 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0000_text_document +0.0018372203137874265 /gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_v1.7_Llama2Tokenizer/wiki-0001_text_document From f63aad14f230c60a0ad523019c09c34075326445 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:02:46 -0500 Subject: [PATCH 222/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 68 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 9b057e0c03..a0d191bb8c 
100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -306,13 +306,46 @@ function sumFiles() { done } +######################################################## +# Setup / activate conda environment, +# mine is called q4-drop +######################################################## +setup_conda_sunspot() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_sirius() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_polaris() { + if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + function setEnv() { # ---- [SunSpot] ------- || ---- [Aurora] -------------- if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then - # PBS_PARENT=$(dirname ${PBS_O_WORKDIR}) - # echo "Sourcing ${PBS_PARENT}/setenv.sh..." - # source "${PBS_PARENT}/setenv.sh" || exit source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit # ----- [Aurora] ----------------------------------- if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then @@ -321,20 +354,22 @@ function setEnv() { # ----- [SunSpot] ---------------------------------- elif [[ $(hostname) == x1* ]]; then echo "Running on SunSpot !!" 
- eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop fi fi # ----- [Polaris] --------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then export MACHINE="Running on Sirius !!" + setup_conda_sirius else echo "Running on Polaris !!" # ---- [load conda] --------------------- - # module load conda/2023-10-04; conda activate cu118-pt221 ; unset PYTHONUSERBASE - if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then - source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" - fi + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi fi elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then echo "Running on Perlmutter !!" @@ -353,8 +388,6 @@ function makeHostfiles() { printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" else make_ds_hostfile - # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv" || exit #> /tmp/savejobenv.log 2>&1 & - # source "${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv" || exit fi } @@ -362,24 +395,27 @@ function setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------ if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" elif [[ $(hostname) == x1* ]]; then - dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/data_file_list_books.txt" - # dfl_fallback="/gila/Aurora_deployment/AuroraGPT/datasets/dolma/data_file_list_reweighted.txt" + # shellcheck: source ./data-lists/sunspot/books.txt + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/books.txt" elif [[ $(hostname) == x3* ]]; then - # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" - dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/data_file_list_books.txt" + if [[ "${PBS_O_HOST}" == sirius* ]]; then + # shellcheck: source ./data-lists/sirius/books.txt + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sirius/books.txt" + elif [[ "${PBS_O_HOST}" == polaris* ]]; then + # shellcheck: source ./data-lists/polaris/books.txt + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/books.txt" + fi elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" else echo "Unknown hostname. Must manually specify DATA_FILE_LIST." 
fi dfl="${1:-${dfl_fallback}}" - # dfl_fallback="/eagle/datasets/dolma/data_file_list_reweighted.txt" printf "Calling: setData() with %s\n" "${dfl}" ndocs=$(wc -l < "${dfl}") ws=$(sumWeights "${dfl}") dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") dcp=".cache/${dfl_stem}/index-cache" - # mkdir -p dcp export DATA_FILE_LIST="${dfl}" export NUM_DOCS="${ndocs}" export WEIGHT_SUM="${ws}" From d329801a8e48a6da10b0a4d2da59961d293691dd Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:02:54 -0500 Subject: [PATCH 223/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index a0d191bb8c..868f0e194b 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -361,7 +361,7 @@ function setEnv() { # ----- [Polaris] --------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then - export MACHINE="Running on Sirius !!" + echo "Running on Sirius !!" setup_conda_sirius else echo "Running on Polaris !!" From 585c15e3557157df2f31e1d5d3ccbf343ca6088c Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:03:08 -0500 Subject: [PATCH 224/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index cd2d8213dd..462da0c4e1 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -35,39 +35,38 @@ sourceFile "${HERE}/ALCF/helpers.sh" || exit # ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ setEnv || exit # 1. load `conda` environment -# saveDSenv || exit # 2. save env vars to `.deepspeed_env` +# saveDSenv || exit # 2. save env vars to `.deepspeed_env` ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars - -# if [[ -z "${HOSTFILE}" ]]; then -# makeHostfiles || exit # 4. 
create `deepspeed` hostfile from `$PBS_NODEFILE` -# else -# echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" -# fi setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset -# setDSlauncher "${HERE}" || exit # 10. set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` printJobInfo || exit # 11. print job info setupLauncher || exit # ----------------------------------------------------------------------------- - +#### [DEPRECATED] ########################################################### +# if [[ -z "${HOSTFILE}" ]]; then +# makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +# else +# echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" +# fi +# ---------------------------------------------------------------------------- +# setDSlauncher "${HERE}" || exit # 10. 
set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +# ---------------------------------------------------------------------------- # TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())') # printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}" -# # if [[ "${TORCH_DEVICE}" == "cuda" ]]; then # printf %s "Setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # fi - - +# ---------------------------------------------------------------------------- # export MPICH_GPU_SUPPORT_ENABLED=1 # export CUDA_DEVICE_MAX_CONNECTIONS=1 # export NCCL_DEBUG=INFO -# -# +############################################################################# + # Assert TBDIR exists inside our $CKPT_DIR TBDIR="${CKPT_DIR}/tensorboard" mkdir -p "${TBDIR}" @@ -130,12 +129,7 @@ run_cmd=" |& tee ${OUTPUT_LOG} " -# ds_exec -# echo "! Using $(which deepspeed)" -# ds_report - echo "${run_cmd}" - printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" eval "${run_cmd}" From 3444b99dd8639659a4c5ed987952df093d09d46a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:03:25 -0500 Subject: [PATCH 225/268] Update `.gitignore` --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index edbde3c246..3b18d4956a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ # User Added +deps/* +OUTPUTS/* +ALCF/OUTPUTS/* *tmp* *core.* *old* From 505aef062892ef1da3e1124e7bab8c383dc54fe9 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:15:03 -0500 Subject: [PATCH 226/268] Update defaults in `ALCF/helpers.sh` --- ALCF/helpers.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 868f0e194b..7fa8e748c1 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -113,12 +113,13 @@ function setParams() { ####################################################### # 
+--------[Polaris]-----------------------------------+ elif [[ $(hostname) == x3* ]]; then - TP=${TP:-2} # TP = 2 + TP=${TP:-1} # TP = 2 export NCCL=${NCCL:-nccl} # NCCL export BE="${NCCL}" # BE = NCCL # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 - MICRO_BATCH=${MICRO_BATCH:-8} # MICRO_BATCH = 8 + export GRAD_ACC_STEPS=4 # GRADIENT_ACCUMULATION_STEPS + MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 if [[ -n "${NO_FLASH_ATTN-}" ]]; then echo "Not using flash-attn!!" else @@ -160,7 +161,7 @@ function setParams() { # +---[Run Settings]------------------------------------------------------+ export LR=${LR:-0.0003} # LEARNING_RATE export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 - export ZERO_STAGE=${ZERO_STAGE:-2} # ZERO OFFLOADING STAGE + export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS export EVAL_ITERS="${EVAL_ITERS:-10}" # NUMBER OF EVAL ITERS TO RUN From e31bb23b9c7e865b873c9d888439ba81018ba387 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 12:15:24 -0500 Subject: [PATCH 227/268] Add `train_agpt.sh` --- train_agpt.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 train_agpt.sh diff --git a/train_agpt.sh b/train_agpt.sh new file mode 100644 index 0000000000..4129f553e2 --- /dev/null +++ b/train_agpt.sh @@ -0,0 +1,11 @@ +#!/bin/bash --login + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" +mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +echo "${OUTFILE}" >> "${OUTDIR}/latest" +echo "Logging job output to: ${OUTFILE}" +bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From a73e9af36ef12ae8435ed72bc15bd78e58bdc670 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 13:58:31 -0500 Subject: [PATCH 228/268] Update `ALCF/test_{polaris,sirius,sunspot}.sh` --- 
ALCF/test_polaris.sh | 4 ++-- ALCF/test_sirius.sh | 2 +- ALCF/test_sunspot.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ALCF/test_polaris.sh b/ALCF/test_polaris.sh index 3a6734f61e..a18c87fad7 100644 --- a/ALCF/test_polaris.sh +++ b/ALCF/test_polaris.sh @@ -21,7 +21,7 @@ NOW="$(date "+%Y-%m-%d-%H%M%S")" # mine is called q4-drop ######################################################## setup_conda() { - if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" @@ -54,7 +54,7 @@ setup_megatron_deepspeed() { main() { local virtual_env="${VIRTUAL_ENV-}" - local conda_prefix="${CONDA_PREFIX}" + local conda_prefix="${CONDA_PREFIX-}" if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then echo "Using conda from: ${conda_prefix}" elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh index 39170fa47c..0a528a9519 100755 --- a/ALCF/test_sirius.sh +++ b/ALCF/test_sirius.sh @@ -54,7 +54,7 @@ setup_megatron_deepspeed() { main() { local virtual_env="${VIRTUAL_ENV-}" - local conda_prefix="${CONDA_PREFIX}" + local conda_prefix="${CONDA_PREFIX-}" if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then echo "Using conda from: ${conda_prefix}" elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh index b39bff7736..b3b22c78b4 100755 --- a/ALCF/test_sunspot.sh +++ b/ALCF/test_sunspot.sh @@ -53,7 +53,7 @@ setup_megatron_deepspeed() { main() { local virtual_env="${VIRTUAL_ENV-}" - local conda_prefix="${CONDA_PREFIX}" + local conda_prefix="${CONDA_PREFIX-}" if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then echo "Using conda from: ${conda_prefix}" elif [[ -n "${virtual_env}" && 
-z "${conda_prefix}" ]]; then From 57f1c964052074e8b75d04ec1a0d4242e3ef1678 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 13:58:47 -0500 Subject: [PATCH 229/268] Add `ALCF/test_alcf.sh` --- ALCF/test_alcf.sh | 166 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 ALCF/test_alcf.sh diff --git a/ALCF/test_alcf.sh b/ALCF/test_alcf.sh new file mode 100644 index 0000000000..367b40d17c --- /dev/null +++ b/ALCF/test_alcf.sh @@ -0,0 +1,166 @@ +#!/bin/bash --login +# +# Run complete test of +# https://github.com/argonne-lcf/Megatron-DeepSpeed +# on {Polaris, Sunspot, Sirius} @ ALCF +# to launch (inside an interactive `qsub -I` job) on Polaris: +# +# ```bash` +# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed/ALCF +# $ bash test_alcf.sh +# ```` + +# EXIT ON ERROR(s) +set -euxo pipefail + +NOW="$(date "+%Y-%m-%d-%H%M%S")" + +setup_conda_sunspot() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$(~/miniconda3/bin/conda shell hook -s posix)" + conda activate q4-drop + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_sirius() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" + micromamba activate 2024-04-23 + else + echo "Found existing python at: $(which python3)" + fi +} + +setup_conda_polaris() { + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 + # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 + export 
MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + micromamba activate 2024-04-25 + else + echo "Found existing python at: $(which python3)" + fi +} + + +function setEnv() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + # setup_conda + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" + setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi + fi + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + echo "Running on Sirius !!" + setup_conda_sirius + else + echo "Running on Polaris !!" 
+ # ---- [load conda] --------------------- + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi + fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" + module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" + else # ------------------------------------- [Unknown] ------------------- + echo "Unknown hostname $(hostname)" + exit 1 + fi + else + echo "Unable to setup python environment. Exiting" + exit 1 + fi + echo "[python] Using: $(which python3)" +} + + + +######################################## +# Make sure ./tmp/Megatron-DeepSpeed +# does not already exist +######################################## +setup_megatron_deepspeed() { + OUTDIR="OUTPUTS/test-polaris-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}" + echo "Running test in: ${OUTDIR}" + echo "WORKING DIRECTORY: $(realpath $(pwd .))" + if [[ -d "Megatron-DeepSpeed" ]]; then + echo "Found existing Megatron-DeepSpeed in ${OUTDIR}" + echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test." + exit + fi + git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed + if [[ -n "${GIT_BRANCH-}" ]]; then + git checkout "${GIT_BRANCH}" + fi +} + + +main() { + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + setup_conda + else + echo "Unable to setup python. 
Exiting" + exit 1 + fi + setup_megatron_deepspeed + export DEBUG=1 + export PBS_O_WORKDIR="$(pwd)" + SUBMITTED_FROM=$(echo $PBS_O_HOST | tr '-' ' ' | awk '{print $1}') + export DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${SUBMITTED_FROM}/books.txt" + if [[ ! -f "${DATA_FILE_LIST}" ]]; then + echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." + exit 1 + fi + export ZERO_STAGE=1 + export NUM_LAYERS=10 + export MICRO_BATCH=8 + export TRAIN_ITER=20 + export TIMING_LOG_LEVEL=1 + bash train_llama_alcf.sh |& tee "test-${SUBMITTED_FROM}-${NOW}".log +} + +main + From 455126caa6a4bf21c2ab87eaee53b1ac7404acde Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 13:59:04 -0500 Subject: [PATCH 230/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 162 ++++++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 7fa8e748c1..516bc59f4c 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -16,6 +16,17 @@ export WORKING_DIR="${WORKING_DIR}" printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} +function check_and_kill_if_running() { + # kill $(ps aux | grep -E "$USER.+(mpi|main.py)" | grep -v grep | awk '{print $2}') + RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//') + if [[ -n "${RUNNING_PIDS}" ]]; + then echo "Caught ${RUNNING_PIDS}" && kill "${RUNNING_PIDS}"; + else + echo "Not currently running. Continuing!" 
+ fi +} + + function setupSrun() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then export NHOSTS="${SLURM_NNODES:-1}" @@ -100,26 +111,27 @@ function setParams() { if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then TP=${TP:-1} # TP = 1 export CCL=${CCL:-ccl} # CCL - export BE="${CCL}" # BE = CCL + export BE="${CCL}" # COMMUNICATION BACKEND = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 - ####################################################### - # if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! + ############################################################## + # NOTE: if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! if [[ -n "${NO_FLASH_ATTN-}" ]]; then echo "Not using flash-attn!!" else LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-builder" fi - ####################################################### + ############################################################## # +--------[Polaris]-----------------------------------+ elif [[ $(hostname) == x3* ]]; then - TP=${TP:-1} # TP = 2 - export NCCL=${NCCL:-nccl} # NCCL - export BE="${NCCL}" # BE = NCCL - # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? - export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 - export GRAD_ACC_STEPS=4 # GRADIENT_ACCUMULATION_STEPS - MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 + TP=${TP:-1} # TP = 2 + export NCCL=${NCCL:-nccl} # NCCL + export BE="${NCCL}" # BE = NCCL + # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? + export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-16} # GRADIENT_ACC_STEPS + # NOTE: MICRO_BATCH is exported below + MICRO_BATCH=${MICRO_BATCH:-1} # MICRO_BATCH = 8 if [[ -n "${NO_FLASH_ATTN-}" ]]; then echo "Not using flash-attn!!" 
else @@ -142,42 +154,40 @@ function setParams() { export TP="${TP}" export PP="${PP:-1}" export DTYPE="${DTYPE:-bf16}" - export OPT="${OPT:-adamw}" + export OPT="${OPT:-adamwschedulefree}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" NHOSTS=$(wc -l < "${HOSTFILE}") if [[ -z "${NGPU_PER_HOST-}" ]]; then NGPU_PER_HOST=$(python3 -c 'import ezpz as ez; print(ez.get_gpus_per_node())') fi export WORLD_SIZE="${WORLD_SIZE:-$(( NHOSTS * NGPU_PER_HOST ))}" - # export WORLD_SIZE="${WORLD_SIZE:-${NGPUS:-$(( ))}}" - # export WORLD_SIZE=${WORLD_SIZE:-$(wc -l < "${HOSTFILE}")} - # +---[Llama2 7B Config]-----------------------------+ + # +---[Llama2 7B Config]--------------------------------------------------+ export MODEL_KEY="Llama-7B" - export HEADS=${HEADS:-${NHEADS:-32}} - export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} - export HIDDEN=${HIDDEN:-4096} - export NUM_KV_HEAD=${NUM_KV_HEAD:-8} - export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} + export HEADS=${HEADS:-${NHEADS:-32}} # NUMBER OF ATEN HEADS + export NLAYERS=${NLAYERS:-${NUM_LAYERS:-32}} # NUMBER OF LAYERS + export HIDDEN=${HIDDEN:-4096} # HIDDEN SIZE + export NUM_KV_HEAD=${NUM_KV_HEAD:-8} # GROUP ATTENTION + export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # FFN HIDDEN SIZE # +---[Run Settings]------------------------------------------------------+ - export LR=${LR:-0.0003} # LEARNING_RATE - export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 - export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE - export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE - export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS - export EVAL_ITERS="${EVAL_ITERS:-10}" # NUMBER OF EVAL ITERS TO RUN - export TRAIN_ITER=${TRAIN_ITER:-317892} # NUMBER OF TRAIN ITERS - export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" # HOW FREQUENTLY TO RUN EVAL - export SAVE_INTERVAL=${SAVE_INTERVAL:-200} # HOW FREQUENTLY TO SAVE CKPTS - export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS + export LR=${LR:-0.0003} # 
LEARNING_RATE + export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP + export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 + export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE + export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} # GRADIENT ACCUMULATION STEPS + export EVAL_ITERS="${EVAL_ITERS:-10}" # NUMBER OF EVAL ITERS TO RUN + export TRAIN_ITER=${TRAIN_ITER:-317892} # NUMBER OF TRAIN ITERS + export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" # HOW FREQUENTLY TO RUN EVAL + export SAVE_INTERVAL=${SAVE_INTERVAL:-200} # HOW FREQUENTLY TO SAVE CKPTS + export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # USE ACTIVATION CHECKPOINTING ? export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # MAX GLOBAL BATCH SIZE export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT - tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL # +----[ADDITIONAL LLAMA SPECIFIC ARGUMENTS]------------------------------ export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" - # +----------------------------------------------------------------------+ } @@ -312,7 +322,7 @@ function sumFiles() { # mine is called q4-drop ######################################################## setup_conda_sunspot() { - if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + if [[ -z 
"${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$(~/miniconda3/bin/conda shell hook -s posix)" conda activate q4-drop @@ -322,7 +332,7 @@ setup_conda_sunspot() { } setup_conda_sirius() { - if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook --shell ${shell_name})" @@ -333,7 +343,9 @@ setup_conda_sirius() { } setup_conda_polaris() { - if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then + if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then + # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 + # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" @@ -345,39 +357,55 @@ setup_conda_polaris() { function setEnv() { - # ---- [SunSpot] ------- || ---- [Aurora] -------------- - if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then - source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit - # ----- [Aurora] ----------------------------------- - if [[ -z "${CONDA_PREFIX}" && -z "${VIRTUAL_ENV}" ]]; then - if [[ $(hostname) == x4* ]]; then - eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 - # ----- [SunSpot] ---------------------------------- - elif [[ $(hostname) == x1* ]]; then - echo "Running on SunSpot !!" 
- setup_conda_sunspot - # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + local virtual_env="${VIRTUAL_ENV-}" + local conda_prefix="${CONDA_PREFIX-}" + if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "Using conda from: ${conda_prefix}" + elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "Using virtual_env from: ${virtual_env}" + elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then + echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No conda_prefix or virtual_env found in environment..." + echo "Setting up conda" + # setup_conda + # ---- [SunSpot] ------- || ---- [Aurora] -------------- + if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # ----- [Aurora] ----------------------------------- + if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then + if [[ $(hostname) == x4* ]]; then + eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 + # ----- [SunSpot] ---------------------------------- + elif [[ $(hostname) == x1* ]]; then + echo "Running on SunSpot !!" + setup_conda_sunspot + # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop + fi fi + # ----- [Polaris] --------------------------------------- + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + echo "Running on Sirius !!" + setup_conda_sirius + else + echo "Running on Polaris !!" + # ---- [load conda] --------------------- + setup_conda_polaris + # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then + # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" + # fi + fi + elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + echo "Running on Perlmutter !!" 
+ module load pytorch + source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" + else # ------------------------------------- [Unknown] ------------------- + echo "Unknown hostname $(hostname)" + exit 1 fi - # ----- [Polaris] --------------------------------------- - elif [[ $(hostname) == x3* ]]; then - if [[ "${PBS_O_HOST}" == sirius* ]]; then - echo "Running on Sirius !!" - setup_conda_sirius - else - echo "Running on Polaris !!" - # ---- [load conda] --------------------- - setup_conda_polaris - # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then - # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" - # fi - fi - elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then - echo "Running on Perlmutter !!" - module load pytorch - source "${SLURM_SUBMIT_DIR}/venvs/perlmutter/pytorch-2.1.0-cu12/bin/activate" - else # ------------------------------------- [Unknown] ------------------- - echo "Unknown hostname $(hostname)" + else + echo "Unable to setup python environment. 
Exiting" exit 1 fi echo "[python] Using: $(which python3)" @@ -473,7 +501,7 @@ function generateDSconfig() { \"gradient_clipping\": 1.0, \"activation_checkpointing\": { \"partition_activations\": true, - \"contiguous_memory_optimization\": false + \"contiguous_memory_optimization\": true }, \"wall_clock_breakdown\": false," flops_profiler="\ From 2a49f6dcb55771fba1b8737060848a4f04ecdd2f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 13:59:49 -0500 Subject: [PATCH 231/268] Update `train_agpt.sh` --- train_agpt.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/train_agpt.sh b/train_agpt.sh index 4129f553e2..a21b988073 100644 --- a/train_agpt.sh +++ b/train_agpt.sh @@ -8,4 +8,5 @@ mkdir -p "${OUTDIR}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" echo "${OUTFILE}" >> "${OUTDIR}/latest" echo "Logging job output to: ${OUTFILE}" +# export DEBUG=1 bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 482dffd1b27fa85f8f8b2365e65151624c078e95 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 26 Apr 2024 13:59:57 -0500 Subject: [PATCH 232/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 462da0c4e1..5d728068a5 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -41,7 +41,7 @@ setParams || exit # 5. set command line arguments to pass to ` buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST}"|| exit # 9. specify `DATA_FILE_LIST` for dolma dataset +setData "${DATA_FILE_LIST-}" || exit # 9. specify `DATA_FILE_LIST` for dolma dataset printJobInfo || exit # 11. 
print job info setupLauncher || exit # ----------------------------------------------------------------------------- @@ -75,6 +75,17 @@ data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" mkdir -p "${data_cache_path}" module list +if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then + TIMING_STR="\ + --timing-log-level ${TIMING_LOG_LEVEL} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + " +else + TIMING_STR="" +fi + + # Take custom args custom_args=" $@" @@ -83,7 +94,6 @@ custom_args=" $@" run_cmd=" ${LAUNCH_CMD} \ --${DTYPE} \ - --optimizer ${OPT} \ --split 100,0,0 \ --log-interval 1 \ --no-bias-gelu-fusion \ @@ -94,16 +104,17 @@ run_cmd=" --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ - --tensorboard-dir ${TBDIR} \ --log-timers-to-tensorboard \ --log-optimizer-states-to-tensorboard \ --lr ${LR} \ + --optimizer ${OPT} \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ --seq-length ${SEQ} \ --num-layers ${NLAYERS} \ --hidden-size ${HIDDEN} \ --train-iters ${TRAIN_ITER} \ + --tensorboard-dir ${TBDIR} \ --eval-iters ${EVAL_ITERS} \ --distributed-backend ${BE} \ --num-attention-heads ${HEADS} \ @@ -119,16 +130,16 @@ run_cmd=" --data-cache-path ${data_cache_path} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ --tokenizer-model ${TOKENIZER_MODEL} \ - --timing-log-level ${TIMING_LOG_LEVEL} \ - --log-timers-to-tensorboard \ - --log-optimizer-states-to-tensorboard \ + --lr-warmup-fraction ${LR_WARMUP_FRAC} \ ${LLAMA_ARGS} \ + ${TIMING_STR} \ $ds_args \ ${gpt_args[*]} \ $custom_args \ |& tee ${OUTPUT_LOG} " +check_and_kill_if_running || exit echo "${run_cmd}" printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" From c04c42da256eeb809a63b758a1f782514c243958 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 27 Apr 2024 00:22:39 -0500 Subject: [PATCH 233/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 516bc59f4c..8976247b09 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -129,9 +129,9 @@ function setParams() { export BE="${NCCL}" # BE = NCCL # export DTYPE=${DTYPE:-bf16} # DTYPE: BF16 ?? export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 - export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-16} # GRADIENT_ACC_STEPS + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS # NOTE: MICRO_BATCH is exported below - MICRO_BATCH=${MICRO_BATCH:-1} # MICRO_BATCH = 8 + MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 if [[ -n "${NO_FLASH_ATTN-}" ]]; then echo "Not using flash-attn!!" else @@ -180,6 +180,7 @@ function setParams() { export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}" # HOW FREQUENTLY TO RUN EVAL export SAVE_INTERVAL=${SAVE_INTERVAL:-200} # HOW FREQUENTLY TO SAVE CKPTS export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}" # TIMING VERBOSITY IN LOGS + export ACT_CKPT_NUM_LAYERS="${ACT_CKPT_NUM_LAYERS:-1}" # NUM LAYERS TO CHECKPOINT ACTIVATIONS export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # USE ACTIVATION CHECKPOINTING ? export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # MAX GLOBAL BATCH SIZE export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT @@ -215,7 +216,7 @@ function setArgs() { echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!" gpt_args+=( "--checkpoint-activations" - "--checkpoint-num-layers 1" + "--checkpoint-num-layers ${ACT_CKPT_NUM_LAYERS}" ) fi export gpt_args @@ -432,7 +433,7 @@ function setData() { # ---- [dfl: abbrv. 
for DATA_FILE_LIST] ------------------ dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sirius/books.txt" elif [[ "${PBS_O_HOST}" == polaris* ]]; then # shellcheck: source ./data-lists/polaris/books.txt - dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/books.txt" + dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" fi elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" From 36fa52093fa09602bbce890169623c2ac50def6a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 1 May 2024 12:56:41 -0500 Subject: [PATCH 234/268] Fix for `conda/2024-04-29` on Polaris --- ALCF/helpers.sh | 17 ++++++++++++----- pretrain_gpt_alcf.py | 5 ++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 8976247b09..0f7b1f3559 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -79,7 +79,7 @@ function loadCondaEnv() { function setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then - export LAUNCH_CMD="${DIST_LAUNCH} --cpu-bind depth -d 16 python3 -Wignore ${EXEC}" + export LAUNCH_CMD="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" else # Assert `./hostfile_deepspeed` exists export hfds="${WORKING_DIR}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit @@ -124,6 +124,7 @@ function setParams() { ############################################################## # +--------[Polaris]-----------------------------------+ elif [[ $(hostname) == x3* ]]; then + # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}" TP=${TP:-1} # TP = 2 export NCCL=${NCCL:-nccl} # NCCL export BE="${NCCL}" # BE = NCCL @@ -347,10 +348,16 @@ setup_conda_polaris() { if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s 
posix)" ; mm activate 2024-04-25 - export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba - shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') - eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" - micromamba activate 2024-04-25 + # export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba + # shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') + # eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" + # micromamba activate 2024-04-25 + module use /soft/modulefiles + module load conda/2024-04-29 ; conda activate base + # unset MPICH_GPU_SUPPORT_ENABLED + # if [[ -d "${WORKING_DIR}/venvs/polaris/2024-04-29" ]]; then + # source "${WORKING_DIR}/venvs/polaris/2024-04-29/bin/activate" + # fi else echo "Found existing python at: $(which python3)" fi diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index f83ef05fc8..2c521a400a 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -2,6 +2,7 @@ """Pretrain GPT""" +from mpi4py import MPI import os from rich import print import torch @@ -40,8 +41,10 @@ # ---- [SETUP COMMS] ------------------------ +# if str(os.environ.get('LAUNCH_CMD', 'mpich')).lower() == 'mpich': RANK = ez.setup_torch(backend="deepspeed") -# RANK = ez.get_rank() +# else: +# RANK = ez.get_rank() WORLD_SIZE = ez.get_world_size() LOCAL_RANK = ez.get_local_rank() DEVICE = ez.get_torch_device() From 3b83b36b1dac258b3ae40d6ba3a3fcfe41165b06 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 1 May 2024 12:57:54 -0500 Subject: [PATCH 235/268] Add `train_agpt_polaris_7B_production.sh` --- train_agpt_polaris_7B_production.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 train_agpt_polaris_7B_production.sh diff --git a/train_agpt_polaris_7B_production.sh b/train_agpt_polaris_7B_production.sh new file mode 100644 index 0000000000..8b45ddc30a --- /dev/null +++ b/train_agpt_polaris_7B_production.sh @@ -0,0 +1,13 @@ +#!/bin/bash --login + +NOW="$(date "+%Y-%m-%d-%H%M%S")" 
+cd "${PBS_O_WORKDIR}" || exit + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" +mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +echo "${OUTFILE}" >> "${OUTDIR}/latest" +echo "Logging job output to: ${OUTFILE}" +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 +# export DEBUG=1 +MICRO_BATCH=2 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From a916a8dd724e232c2db46b5ceb90143469681774 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 1 May 2024 13:50:58 -0500 Subject: [PATCH 236/268] Update `ALCF/helpers.sh` on Sunspot --- ALCF/helpers.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 0f7b1f3559..d42888b56e 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -113,6 +113,7 @@ function setParams() { export CCL=${CCL:-ccl} # CCL export BE="${CCL}" # COMMUNICATION BACKEND = CCL export DTYPE=${DTYPE:-bf16} # DTYPE: bf16 + export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4 ############################################################## # NOTE: if NO_FLASH_ATTN is NON-empty; then NO FLASH ATTN !! 
@@ -326,7 +327,7 @@ function sumFiles() { setup_conda_sunspot() { if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') - eval "$(~/miniconda3/bin/conda shell hook -s posix)" + eval "$(~/miniconda3/bin/conda shell.${shell_name} hook)" conda activate q4-drop else echo "Found existing python at: $(which python3)" From 5257721b69e40c635df1e42f08d04bd6f21cdf73 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Wed, 8 May 2024 08:27:09 -0500 Subject: [PATCH 237/268] Update `ALCF/test_alcf.sh` --- ALCF/test_alcf.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ALCF/test_alcf.sh b/ALCF/test_alcf.sh index 367b40d17c..853addc59d 100644 --- a/ALCF/test_alcf.sh +++ b/ALCF/test_alcf.sh @@ -154,9 +154,9 @@ main() { echo "Unable to find / use ${DATA_FILE_LIST}. Exiting." exit 1 fi - export ZERO_STAGE=1 - export NUM_LAYERS=10 - export MICRO_BATCH=8 + # export ZERO_STAGE=1 + # export NUM_LAYERS=10 + # export MICRO_BATCH=8 export TRAIN_ITER=20 export TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh |& tee "test-${SUBMITTED_FROM}-${NOW}".log From 1ad039c4d76bd661ddd4fddf59d14eb1fffce0b1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 9 May 2024 12:45:49 -0500 Subject: [PATCH 238/268] Add `train_agpt_polaris_7B_production_NCCL_OFI.sh` --- train_agpt_polaris_7B_production_NCCL_OFI.sh | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 train_agpt_polaris_7B_production_NCCL_OFI.sh diff --git a/train_agpt_polaris_7B_production_NCCL_OFI.sh b/train_agpt_polaris_7B_production_NCCL_OFI.sh new file mode 100644 index 0000000000..fd787dc80f --- /dev/null +++ b/train_agpt_polaris_7B_production_NCCL_OFI.sh @@ -0,0 +1,23 @@ +#!/bin/bash --login + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" +mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +echo "${OUTFILE}" >> "${OUTDIR}/latest" +echo "Logging job 
output to: ${OUTFILE}" + +# export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_COLLNET_ENABLE=1 +export NCCL_NET="AWS Libfabric" +export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH +export FI_CXI_DISABLE_HOST_REGISTER=1 +export FI_MR_CACHE_MONITOR=userfaultfd +export FI_CXI_DEFAULT_CQ_SIZE=131072 +# export DEBUG=1 +MICRO_BATCH=4 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" +# MICRO_BATCH=1 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 328dfda088ac98090627ac95e9abd9932f1d57a3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 14 May 2024 20:43:19 -0500 Subject: [PATCH 239/268] Create `flash-attn-fix` branch --- ALCF/aws_ofi_nccl_plugin.sh | 20 ++++ ALCF/helpers.sh | 103 ++++++++++++------- pretrain_gpt_alcf.py | 2 +- train_agpt_polaris_7B_production.sh | 28 +++-- train_agpt_polaris_7B_production_NCCL_OFI.sh | 23 ----- 5 files changed, 111 insertions(+), 65 deletions(-) create mode 100644 ALCF/aws_ofi_nccl_plugin.sh delete mode 100644 train_agpt_polaris_7B_production_NCCL_OFI.sh diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh new file mode 100644 index 0000000000..ffd1471cd3 --- /dev/null +++ b/ALCF/aws_ofi_nccl_plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash --login + +# AWS NCCL OFI Plugin settings below +export NCCL_CROSS_NIC=1 +export NCCL_COLLNET_ENABLE=1 +export NCCL_NET="AWS Libfabric" +export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH +export FI_CXI_DISABLE_HOST_REGISTER=1 +export FI_MR_CACHE_MONITOR=userfaultfd +export FI_CXI_DEFAULT_CQ_SIZE=131072 +######################################################### +# WARNING: !!! 
+# - Currently, `export NCCL_NET_GDR_LEVEL=PHB` +# causes a hang on Polaris. +# so, we don't set it for the time being [2024-05-14]. +# - Seems to work on Perlmutter ??? +# +# export NCCL_NET_GDR_LEVEL=PHB +######################################################### diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index d42888b56e..a257c3fb3f 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -156,7 +156,7 @@ function setParams() { export TP="${TP}" export PP="${PP:-1}" export DTYPE="${DTYPE:-bf16}" - export OPT="${OPT:-adamwschedulefree}" + export OPT="${OPT:-adamw}" export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}" NHOSTS=$(wc -l < "${HOSTFILE}") if [[ -z "${NGPU_PER_HOST-}" ]]; then @@ -322,9 +322,8 @@ function sumFiles() { ######################################################## # Setup / activate conda environment, -# mine is called q4-drop ######################################################## -setup_conda_sunspot() { +setup_conda_sunspot() { # mine is called `q4-drop` on Sunspot if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then shell_name=$(echo "${SHELL}" | tr "\/" "\t" | awk '{print $NF}') eval "$(~/miniconda3/bin/conda shell.${shell_name} hook)" @@ -334,6 +333,9 @@ setup_conda_sunspot() { fi } +######################## +# Setup conda on Sirius +######################## setup_conda_sirius() { if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then export MAMBA_ROOT_PREFIX=/lus/tegu/projects/PolarisAT/foremans/micromamba @@ -345,20 +347,27 @@ setup_conda_sirius() { fi } +######################## +# Setup conda on Polaris +######################## setup_conda_polaris() { - if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then - # export CUDA_HOME=/soft/compilers/cudatoolkit/cuda-12.2.2 - # && export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba && eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" ; mm activate 2024-04-25 - # export MAMBA_ROOT_PREFIX=/eagle/argonne_tpc/micromamba - # shell_name=$(echo "${SHELL}" | tr 
"\/" "\t" | awk '{print $NF}') - # eval "$("${MAMBA_ROOT_PREFIX}/bin/micromamba" shell hook -s posix)" - # micromamba activate 2024-04-25 + # unset MPICH_GPU_SUPPORT_ENABLED + ###### check if CONDA_PREFIX non-empty ################ + if [[ -z "${CONDA_PREFIX-}" ]]; then + # if so, load the default conda/2024-04-29 + # module and activate base environment module use /soft/modulefiles module load conda/2024-04-29 ; conda activate base - # unset MPICH_GPU_SUPPORT_ENABLED - # if [[ -d "${WORKING_DIR}/venvs/polaris/2024-04-29" ]]; then - # source "${WORKING_DIR}/venvs/polaris/2024-04-29/bin/activate" - # fi + else + echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + fi + ###### check if VIRTUAL_ENV non-empty ################# + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + DEFAULT_VENV_PATH=${WORKING_DIR}/venvs/polaris/2024-04-29 + if [[ -d "${DEFAULT_VENV_PATH}" ]]; then + echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" + source "${WORKING_DIR}/venvs/polaris/2024-04-29/bin/activate" + fi else echo "Found existing python at: $(which python3)" fi @@ -369,42 +378,42 @@ function setEnv() { local virtual_env="${VIRTUAL_ENV-}" local conda_prefix="${CONDA_PREFIX-}" if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then + echo "No virtual environment found." echo "Using conda from: ${conda_prefix}" elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then + echo "No conda found." echo "Using virtual_env from: ${virtual_env}" elif [[ -n "${virtual_env}" && -n "${conda_prefix}" ]]; then - echo "Using virtual_env: ${virtual_env} on top of CONDA: ${conda_prefix}" + echo "Using virtual_env: ${virtual_env} on top of conda from: ${conda_prefix}" elif [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then echo "No conda_prefix or virtual_env found in environment..." - echo "Setting up conda" - # setup_conda + echo "Setting up conda..." 
+ ######################## setup_conda ############################ # ---- [SunSpot] ------- || ---- [Aurora] -------------- if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit - # ----- [Aurora] ----------------------------------- + # ----- [Aurora] -------------------------------------------- if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then if [[ $(hostname) == x4* ]]; then eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 - # ----- [SunSpot] ---------------------------------- + # ----- [SunSpot] --------------------------------------- elif [[ $(hostname) == x1* ]]; then echo "Running on SunSpot !!" setup_conda_sunspot - # eval "$(/home/foremans/miniconda3/bin/conda shell.zsh hook)" && conda activate q4-drop fi fi - # ----- [Polaris] --------------------------------------- + # ----- [Polaris] ----------------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then echo "Running on Sirius !!" setup_conda_sirius else echo "Running on Polaris !!" - # ---- [load conda] --------------------- + # ---- [load conda] ------------------------------------- setup_conda_polaris - # if [[ -d "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221" ]]; then - # source "${PBS_O_WORKDIR}/venvs/polaris/cu118-pt221/bin/activate" - # fi fi + echo "Setting up AWS NCCL OFI Plugin on Polaris..." + source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then echo "Running on Perlmutter !!" module load pytorch @@ -417,10 +426,19 @@ function setEnv() { echo "Unable to setup python environment. Exiting" exit 1 fi - echo "[python] Using: $(which python3)" + printf "\n" + pystr="Using: $(which python3)" + printf "[python] %s" "$(printMagenta ${pystr})" + printf "\n" } +###################################################################### +# `makeHostiles`: +# Detect if `HOSTFILE` set in active environment. 
+# - If so, use this. +# - Otherwise, make default HOSTFILEs from "${PBS_NODEFILE}" +###################################################################### function makeHostfiles() { if [[ -n "${HOSTFILE}" ]]; then printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" @@ -429,25 +447,40 @@ function makeHostfiles() { fi } -function setData() { # ---- [dfl: abbrv. for DATA_FILE_LIST] ------------------------- - if [[ $(hostname) == x4* ]]; then # ---- [AURORA] ---- +############################################### +# `setData`: +# Ensure `DATA_FILE_LIST` is set, +# fallback to default values if necessary. +############################################### +function setData() { # ----------------------[dfl: abbrv. for DATA_FILE_LIST] + # dfldir="${WORKING_DIR}/ALCF/data-lists" + # =====[Set DATA_FILE_LIST_FALLBACK based on current machine]============== + if [[ $(hostname) == x4* ]]; then # ---------------------------[AURORA] dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" - elif [[ $(hostname) == x1* ]]; then + + elif [[ $(hostname) == x1* ]]; then # --------------------------[SUNSPOT] # shellcheck: source ./data-lists/sunspot/books.txt dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/books.txt" - elif [[ $(hostname) == x3* ]]; then - if [[ "${PBS_O_HOST}" == sirius* ]]; then + + elif [[ $(hostname) == x3* ]]; then # -------------------[POLARIS / SIRIUS] + if [[ "${PBS_O_HOST}" == sirius* ]]; then # -------------------[SIRIUS] # shellcheck: source ./data-lists/sirius/books.txt dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sirius/books.txt" - elif [[ "${PBS_O_HOST}" == polaris* ]]; then + + elif [[ "${PBS_O_HOST}" == polaris* ]]; then # ---------------[POLARIS] # shellcheck: source ./data-lists/polaris/books.txt dfl_fallback="${WORKING_DIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" fi - elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then + + elif [[ $(hostname) == login* || $(hostname) == 
nid* ]]; then # [PERLMUTTER] dfl_fallback="${SLURM_SUBMIT_DIR}/genslm-subsample.txt" - else + + else # -----------------------------------------------------------[UNKNOWN] echo "Unknown hostname. Must manually specify DATA_FILE_LIST." fi + # ========================================================================== + # set `dfl` to `dfl_fallback` if not passed as an argument, + # use this data file list to call `setData` dfl="${1:-${dfl_fallback}}" printf "Calling: setData() with %s\n" "${dfl}" ndocs=$(wc -l < "${dfl}") @@ -516,7 +549,7 @@ function generateDSconfig() { flops_profiler="\ \"flops_profiler\": { \"enabled\": true, - \"profile_step\": 4, + \"profile_step\": 2, \"module_depth\": -1, \"top_modules\": 1, \"detailed\": true, diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 2c521a400a..750bd21e48 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -42,7 +42,7 @@ # ---- [SETUP COMMS] ------------------------ # if str(os.environ.get('LAUNCH_CMD', 'mpich')).lower() == 'mpich': -RANK = ez.setup_torch(backend="deepspeed") +RANK = ez.setup_torch(backend="deepspeed", timeout=7200) # else: # RANK = ez.get_rank() WORLD_SIZE = ez.get_world_size() diff --git a/train_agpt_polaris_7B_production.sh b/train_agpt_polaris_7B_production.sh index 8b45ddc30a..f83b6ebc29 100644 --- a/train_agpt_polaris_7B_production.sh +++ b/train_agpt_polaris_7B_production.sh @@ -1,13 +1,29 @@ #!/bin/bash --login +# +# This script can be submitted with `qsub` via: +# +# ```bash +# $ git clone https://github.com/argonee-lcf/Megatron-DeepSpeed +# $ cd Megatron-DeepSpeed +# $ qsub train_agpt_polaris_7B_production.sh +# ``` -NOW="$(date "+%Y-%m-%d-%H%M%S")" cd "${PBS_O_WORKDIR}" || exit -OUTDIR="${PBS_O_WORKDIR}/pbslogs" -mkdir -p "${OUTDIR}" +TODAY="$(date "+%Y-%m-%d")" +NOW="$(date "+%Y-%m-%d-%H%M%S")" +OUTDIR="${PBS_O_WORKDIR}/pbslogs/${TODAY}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" -echo "${OUTFILE}" >> "${OUTDIR}/latest" +mkdir -p $(dirname "${OUTFILE}") + +echo 
"${OUTFILE}" >> "$(dirname ${OUTDIR})/latest" echo "Logging job output to: ${OUTFILE}" -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 + # export DEBUG=1 -MICRO_BATCH=2 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" +# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 + +# Path to the data file list: +DFL="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" + +# Launch: +MICRO_BATCH=2 DATA_FILE_LIST="${DFL}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_agpt_polaris_7B_production_NCCL_OFI.sh b/train_agpt_polaris_7B_production_NCCL_OFI.sh deleted file mode 100644 index fd787dc80f..0000000000 --- a/train_agpt_polaris_7B_production_NCCL_OFI.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash --login - -NOW="$(date "+%Y-%m-%d-%H%M%S")" -cd "${PBS_O_WORKDIR}" || exit - -OUTDIR="${PBS_O_WORKDIR}/pbslogs" -mkdir -p "${OUTDIR}" -OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" -echo "${OUTFILE}" >> "${OUTDIR}/latest" -echo "Logging job output to: ${OUTFILE}" - -# export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_COLLNET_ENABLE=1 -export NCCL_NET="AWS Libfabric" -export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH -export FI_CXI_DISABLE_HOST_REGISTER=1 -export FI_MR_CACHE_MONITOR=userfaultfd -export FI_CXI_DEFAULT_CQ_SIZE=131072 -# export DEBUG=1 -MICRO_BATCH=4 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" -# MICRO_BATCH=1 DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/polaris/books.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From bd8cb0917cf0ca4700f68a0b33ffc26fc20e4718 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 14 May 2024 20:44:13 -0500 Subject: [PATCH 240/268] Add + pass default 
`LR_DECAY_ITERS` --- ALCF/helpers.sh | 9 +++++++-- train_llama_alcf.sh | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index a257c3fb3f..e66d5ff500 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -173,6 +173,7 @@ function setParams() { # +---[Run Settings]------------------------------------------------------+ export LR=${LR:-0.0003} # LEARNING_RATE export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP + export LR_DECAY_ITERS=${LR_DECAY_ITERS:-320000} # LR DECAY ITERS export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE @@ -281,7 +282,10 @@ function saveDSenv() { function setOutput() { # ---- Specify output location -------------------------------- - export OUTPUT_PREFIX="ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + export OUTPUT_PREFIX="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + if [[ -z "${NO_FLASH_ATTN:-}" ]]; then + export OUTPUT_PREFIX="${OUTPUT_PREFIX}_flash" + fi # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" @@ -289,7 +293,8 @@ function setOutput() { export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" echo "${OUTPUT_LOG}" >> "logs/latest" mkdir -p "${OUTPUT_DIR}" - echo "!!!Please see logs at ${OUTPUT_DIR}" + printf "Please see logs at: %s\n" $(printGreen "${OUTPUT_DIR}") + printf "Checkpoints will be saved to: %s\n" $(printYellow "${CKPT_DIR}") } function buildDSconfig() { diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 5d728068a5..3e0f9390cb 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -124,6 +124,7 @@ run_cmd=" --micro-batch-size 
${MICRO_BATCH} \ --data-file-list ${DATA_FILE_LIST} \ --tensor-model-parallel-size ${TP} \ + --lr-decay-iters ${LR_DECAY_ITERS} \ --global-batch-size ${GLOBAL_BATCH} \ --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ From 59f60522b486738e443177296af43c5807f8a57b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 14 May 2024 20:45:16 -0500 Subject: [PATCH 241/268] Add `aGPT_7B.sh` --- aGPT_7B.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 aGPT_7B.sh diff --git a/aGPT_7B.sh b/aGPT_7B.sh new file mode 100644 index 0000000000..0c192b1b4f --- /dev/null +++ b/aGPT_7B.sh @@ -0,0 +1,34 @@ +#!/bin/bash --login +# +HOSTNAME=$(hostname) +if [[ "${HOSTNAME}" == x3* ]]; then + MACHINE="polaris" +elif [[ "${HOSTNAME}" == x1* ]]; then + MACHINE="sunspot" +elif [[ "${HOSTNAME}" == x4* ]]; then + MACHINE="aurora" +fi + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit +nhosts=$(wc -l < "${HOSTFILE}") + +if [[ "${nhosts}" == 1 || "${nhosts}" == 2 ]]; then + MBS=1 +elif [[ "${nhosts}" -ge 2 ]]; then + MBS=2 +elif [[ "${nhosts}" -ge 8 ]]; then + MBS=4 +fi + +printf "Detected %s hosts. 
Running with micro_batch:\n" ${nhosts} ${MBS} + +OUTDIR="${PBS_O_WORKDIR}/pbslogs" +mkdir -p "${OUTDIR}" +OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" +echo "Running on: ${MACHINE}" +echo "${OUTFILE}" >> "${OUTDIR}/latest" +echo "Logging job output to: ${OUTFILE}" +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 +# export DEBUG=1 +MICRO_BATCH="${MBS}" DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 9f98c0912437f2abbd05c663e54c5329210c7d91 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 14 May 2024 20:46:21 -0500 Subject: [PATCH 242/268] Update `.gitignore` --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3b18d4956a..0a1387f56a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ # User Added +**.e1** +**.o1** deps/* OUTPUTS/* ALCF/OUTPUTS/* From 2cc2965c4831cda9621ce4781b3b24f478faa68c Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Tue, 14 May 2024 20:47:54 -0500 Subject: [PATCH 243/268] Rename `aGPT_7B.sh` -> `train_aGPT_7B.sh` --- aGPT_7B.sh => train_aGPT_7B.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename aGPT_7B.sh => train_aGPT_7B.sh (100%) diff --git a/aGPT_7B.sh b/train_aGPT_7B.sh similarity index 100% rename from aGPT_7B.sh rename to train_aGPT_7B.sh From 05e8af3ad75b969f05c1a294ac7ccedfdf30a171 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 16 May 2024 10:43:50 -0500 Subject: [PATCH 244/268] Add `ALCF/aws_ofi_nccl_plugin.sh` for Polaris --- ALCF/helpers.sh | 44 +++++++++++++++++++++++++++++++++----------- train_aGPT_7B.sh | 25 ++++++++++++++++--------- train_llama_alcf.sh | 21 +++++++++++---------- 3 files changed, 60 insertions(+), 30 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index e66d5ff500..1bd0febf24 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -16,6 +16,27 @@ export WORKING_DIR="${WORKING_DIR}" printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} 
+function get_machine() { + if [[ $(hostname) == x4* ]]; then + machine="aurora" + elif [[ $(hostname) == x1* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + echo "Unknown MACHINE. Setting MACHINE to $(hostname) and continuing..." + fi + export MACHINE="${machine}" + printf "Running on: %s\n" "$(printBlue ${MACHINE})" +} + + function check_and_kill_if_running() { # kill $(ps aux | grep -E "$USER.+(mpi|main.py)" | grep -v grep | awk '{print $2}') RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//') @@ -139,6 +160,8 @@ function setParams() { else LLAMA_ARGS="${LLAMA_ARGS} --use-flash-attn-v2" fi + echo "Setting up AWS NCCL OFI Plugin on Polaris..." + source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit # +--------[Perlmutter]---------------------------------+ elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then TP="${TP:-2}" @@ -293,7 +316,7 @@ function setOutput() { export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}" echo "${OUTPUT_LOG}" >> "logs/latest" mkdir -p "${OUTPUT_DIR}" - printf "Please see logs at: %s\n" $(printGreen "${OUTPUT_DIR}") + printf "\n Please see logs at: %s\n" $(printGreen "${OUTPUT_DIR}") printf "Checkpoints will be saved to: %s\n" $(printYellow "${CKPT_DIR}") } @@ -394,12 +417,13 @@ function setEnv() { echo "No conda_prefix or virtual_env found in environment..." echo "Setting up conda..." 
######################## setup_conda ############################ - # ---- [SunSpot] ------- || ---- [Aurora] -------------- + # ---- [SunSpot @ ALCF] || [Aurora @ ALCF] --------------------- if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit # ----- [Aurora] -------------------------------------------- if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then if [[ $(hostname) == x4* ]]; then + # TODO: Update once Aurora back online eval "$(conda shell.zsh hook)" && conda activate anl_release_q4v2 # ----- [SunSpot] --------------------------------------- elif [[ $(hostname) == x1* ]]; then @@ -407,7 +431,7 @@ function setEnv() { setup_conda_sunspot fi fi - # ----- [Polaris] ----------------------------------------------- + # ----- [Polaris @ ALCF] -------------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then echo "Running on Sirius !!" @@ -417,8 +441,7 @@ function setEnv() { # ---- [load conda] ------------------------------------- setup_conda_polaris fi - echo "Setting up AWS NCCL OFI Plugin on Polaris..." - source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit + # ----- [Perlmutter @ NERSC] ---------------------------------------- elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then echo "Running on Perlmutter !!" module load pytorch @@ -431,10 +454,9 @@ function setEnv() { echo "Unable to setup python environment. Exiting" exit 1 fi - printf "\n" + ##################################################################### pystr="Using: $(which python3)" - printf "[python] %s" "$(printMagenta ${pystr})" - printf "\n" + printf "\n[python] %s\n" "$(printMagenta ${pystr})" } @@ -457,13 +479,13 @@ function makeHostfiles() { # Ensure `DATA_FILE_LIST` is set, # fallback to default values if necessary. ############################################### -function setData() { # ----------------------[dfl: abbrv. 
for DATA_FILE_LIST] +function setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] # dfldir="${WORKING_DIR}/ALCF/data-lists" # =====[Set DATA_FILE_LIST_FALLBACK based on current machine]============== - if [[ $(hostname) == x4* ]]; then # ---------------------------[AURORA] + if [[ $(hostname) == x4* ]]; then # -----------------------------[AURORA] dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" - elif [[ $(hostname) == x1* ]]; then # --------------------------[SUNSPOT] + elif [[ $(hostname) == x1* ]]; then # ----------------------------[SUNSPOT] # shellcheck: source ./data-lists/sunspot/books.txt dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/books.txt" diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index 0c192b1b4f..6bc8f1f9dc 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -11,24 +11,31 @@ fi NOW="$(date "+%Y-%m-%d-%H%M%S")" cd "${PBS_O_WORKDIR}" || exit -nhosts=$(wc -l < "${HOSTFILE}") +export nhosts=$(wc -l < "${PBS_NODEFILE}") -if [[ "${nhosts}" == 1 || "${nhosts}" == 2 ]]; then - MBS=1 +if [[ "${nhosts}" == 1 ]]; then + export MBS=1 +elif [[ "${nhosts}" == 2 ]]; then + export MBS=1 elif [[ "${nhosts}" -ge 2 ]]; then - MBS=2 + export MBS=2 elif [[ "${nhosts}" -ge 8 ]]; then - MBS=4 + export MBS=4 fi -printf "Detected %s hosts. Running with micro_batch:\n" ${nhosts} ${MBS} +# printf "Detected %s hosts. Running with micro_batch:\n" ${nhosts} ${MBS} OUTDIR="${PBS_O_WORKDIR}/pbslogs" mkdir -p "${OUTDIR}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" -echo "Running on: ${MACHINE}" -echo "${OUTFILE}" >> "${OUTDIR}/latest" -echo "Logging job output to: ${OUTFILE}" + +echo "+---------------------------------------------------------+" +echo "| Running on: ${MACHINE}" +echo "| Detected ${nhosts} hosts. 
Running with micro batch: ${MBS}" +echo "| Logging job output to: ${OUTFILE}" +echo "+---------------------------------------------------------+" + export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 +echo "${OUTFILE}" >> "${OUTDIR}/latest" # export DEBUG=1 MICRO_BATCH="${MBS}" DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 3e0f9390cb..237bcf7c15 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -34,16 +34,17 @@ export EXEC="${HERE}/pretrain_gpt_alcf.py" sourceFile "${HERE}/ALCF/helpers.sh" || exit # ----[3. Call fns from `./ALCF/helpers_alcf.sh`]------------------------------ -setEnv || exit # 1. load `conda` environment -# saveDSenv || exit # 2. save env vars to `.deepspeed_env` -ezpz || exit # 3. determine WORLD_SIZE, etc. from `PBS_*` vars -setParams || exit # 5. set command line arguments to pass to `"${EXEC}"` -buildDSconfig || exit # 6. create `deepspeed_config.json` from runtime params from ^ -setOutput || exit # 7. specify output directory for {logs, checkpoints, etc.} -setArgs || exit # 8. specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST-}" || exit # 9. specify `DATA_FILE_LIST` for dolma dataset -printJobInfo || exit # 11. print job info -setupLauncher || exit +get_machine || exit # 01. Identify machine we're on +setEnv || exit # 02. Load `conda` environment +# saveDSenv || exit # 03. Save env vars to `.deepspeed_env` +ezpz || exit # 04. Determine WORLD_SIZE, etc. from `PBS_*` vars +setParams || exit # 05. Set command line arguments to pass to `"${EXEC}"` +buildDSconfig || exit # 06. Create `deepspeed_config.json` from runtime params from ^ +setOutput || exit # 07. Specify output directory for {logs, checkpoints, etc.} +setArgs || exit # 08. Specify additional `deepspeed` arguments +setData "${DATA_FILE_LIST-}" || exit # 09. 
Specify `DATA_FILE_LIST` for dolma dataset +printJobInfo || exit # 11. Print job info +setupLauncher || exit # 12. set launcher to one of `MPICH` (default), or `deepspeed` # ----------------------------------------------------------------------------- #### [DEPRECATED] ########################################################### From 6b4ea4c7085c0588214dca07b28a9fa76b0f6657 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 16 May 2024 10:49:20 -0500 Subject: [PATCH 245/268] Update `ALCF/{helpers.sh,train_llama_alcf.sh}` --- ALCF/helpers.sh | 52 +++++++++++++++++++++++++++++++++++---------- train_llama_alcf.sh | 13 ++++++------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 1bd0febf24..2422ecbe7a 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -125,6 +125,18 @@ function setDSlauncher() { fi } +function set_lr_args() { + LR_ARGS="--lr ${LR} --lr-decay-style cosine" + if [[ -n "${LR_DECAY_ITERS:-}" ]]; then + LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}" + fi + if [[ -n "${LR_WARMUP_FRAC}" ]]; then + LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}" + fi + echo "LR_ARGS: ${LR_ARGS}" + export LR_ARGS="${LR_ARGS}" +} + function setParams() { LLAMA_ARGS="" # +----[Parallelism Settings] -------------------------------------------+ @@ -194,9 +206,6 @@ function setParams() { export NUM_KV_HEAD=${NUM_KV_HEAD:-8} # GROUP ATTENTION export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} # FFN HIDDEN SIZE # +---[Run Settings]------------------------------------------------------+ - export LR=${LR:-0.0003} # LEARNING_RATE - export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP - export LR_DECAY_ITERS=${LR_DECAY_ITERS:-320000} # LR DECAY ITERS export SEQ=${SEQ:-4096} # SEQ_LEN: 4096 export ZERO_STAGE=${ZERO_STAGE:-1} # ZERO OFFLOADING STAGE export MICRO_BATCH=${MICRO_BATCH:-8} # MICRO BATCH SIZE @@ -215,6 +224,11 @@ function setParams() { export 
MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL # +----[ADDITIONAL LLAMA SPECIFIC ARGUMENTS]------------------------------ export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" + export LR=${LR:-0.0003} # LEARNING_RATE + export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP + # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-320000} # LR DECAY ITERS + export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS + set_lr_args } @@ -305,10 +319,15 @@ function saveDSenv() { function setOutput() { # ---- Specify output location -------------------------------- - export OUTPUT_PREFIX="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + OUTPUT_PREFIX="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_hs${HIDDEN}_mb${MICRO_BATCH}" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_seq${SEQ}_gb${GLOBAL_BATCH}" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_lr${LR}_lwf${LR_WARMUP_FRAC}_ldi${LR_DECAY_ITERS}" if [[ -z "${NO_FLASH_ATTN:-}" ]]; then - export OUTPUT_PREFIX="${OUTPUT_PREFIX}_flash" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_flash" fi + export OUTPUT_PREFIX="${OUTPUT_PREFIX}" # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" @@ -351,11 +370,20 @@ function sumFiles() { ######################################################## # Setup / activate conda environment, ######################################################## -setup_conda_sunspot() { # mine is called `q4-drop` on Sunspot - if [[ -z "${CONDA_PREFIX-}" && -z "${VIRTUAL_ENV-}" ]]; then - shell_name=$(echo 
"${SHELL}" | tr "\/" "\t" | awk '{print $NF}') - eval "$(~/miniconda3/bin/conda shell.${shell_name} hook)" - conda activate q4-drop +setup_conda_sunspot() { + ###### check if CONDA_PREFIX non-empty ################ + if [[ -z "${CONDA_PREFIX-}" ]]; then + module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 + else + echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + fi + ###### check if VIRTUAL_ENV non-empty ################# + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + DEFAULT_VENV_PATH=${WORKING_DIR}/venvs/frameworks_2024_5_v2 + if [[ -d "${DEFAULT_VENV_PATH}" ]]; then + echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" + source "${WORKING_DIR}/${DEFAULT_VENV_PATH}/bin/activate" + fi else echo "Found existing python at: $(which python3)" fi @@ -456,7 +484,9 @@ function setEnv() { fi ##################################################################### pystr="Using: $(which python3)" - printf "\n[python] %s\n" "$(printMagenta ${pystr})" + printf "[python] %s" "$(printMagenta ${pystr})" + printf "\n" + export "PYTHON_EXEC=$(which python3)" } diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 237bcf7c15..d877c19a06 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -75,6 +75,7 @@ mkdir -p "${TBDIR}" data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" mkdir -p "${data_cache_path}" module list +printenv > "${CKPT_DIR}/.env" if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then TIMING_STR="\ @@ -87,6 +88,8 @@ else fi + + # Take custom args custom_args=" $@" @@ -98,7 +101,6 @@ run_cmd=" --split 100,0,0 \ --log-interval 1 \ --no-bias-gelu-fusion \ - --lr-decay-style cosine \ --no-bias-dropout-fusion \ --no-masked-softmax-fusion \ --tokenizer-type Llama2Tokenizer \ @@ -107,7 +109,6 @@ run_cmd=" --use-checkpoint-opt_param-scheduler \ --log-timers-to-tensorboard \ --log-optimizer-states-to-tensorboard \ - --lr ${LR} \ --optimizer ${OPT} \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ @@ -125,14 +126,13 @@ run_cmd=" --micro-batch-size ${MICRO_BATCH} 
\ --data-file-list ${DATA_FILE_LIST} \ --tensor-model-parallel-size ${TP} \ - --lr-decay-iters ${LR_DECAY_ITERS} \ --global-batch-size ${GLOBAL_BATCH} \ --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ --data-cache-path ${data_cache_path} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ --tokenizer-model ${TOKENIZER_MODEL} \ - --lr-warmup-fraction ${LR_WARMUP_FRAC} \ + ${LR_ARGS} \ ${LLAMA_ARGS} \ ${TIMING_STR} \ $ds_args \ @@ -143,7 +143,8 @@ run_cmd=" check_and_kill_if_running || exit echo "${run_cmd}" -printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" -printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" +printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow ${OUTPUT_LOG})" +# printf "[!! \e[1;31m%s\e[0m] View output at:\n" "NOTE" +# printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" eval "${run_cmd}" set +x From 14970b9a01a1055c143b1969dff89d6aed99bfb1 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 16 May 2024 11:54:21 -0500 Subject: [PATCH 246/268] Update `ALCF/helpers.sh` on Sunspot --- ALCF/helpers.sh | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 2422ecbe7a..9fe9934dab 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -372,18 +372,30 @@ function sumFiles() { ######################################################## setup_conda_sunspot() { ###### check if CONDA_PREFIX non-empty ################ - if [[ -z "${CONDA_PREFIX-}" ]]; then - module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 - else - echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + if [[ -z "${CONDA_PREFIX:-}" ]]; then + eval "$(~/miniconda3/bin/conda shell.zsh hook)" + conda activate anl_24_q2_release fi - ###### check if VIRTUAL_ENV non-empty ################# + # ------------------------------------------------------------------------ + # XXX: Jerome's `frameworks_2024_5_v2` seems broken ?? 
+ # - seems to be missing `python3 -c 'from mpi4py import MPI'` ??? + # - consequently, we leave the setup below commented out (for the time + # being): + # if [[ -z "${CONDA_PREFIX-}" ]]; then + # module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 + # else + # echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + # fi + # + # ------------------------------------------------------------------------ + ###### check if VIRTUAL_ENV non-empty #################################### if [[ -z "${VIRTUAL_ENV:-}" ]]; then - DEFAULT_VENV_PATH=${WORKING_DIR}/venvs/frameworks_2024_5_v2 - if [[ -d "${DEFAULT_VENV_PATH}" ]]; then - echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" - source "${WORKING_DIR}/${DEFAULT_VENV_PATH}/bin/activate" - fi + DEFAULT_VENV_PATH="${WORKING_DIR}/venvs/anl_24_q2_release" + # venvs/anl_24_q2_release/bin/activate + # if [[ -d "${DEFAULT_VENV_PATH}" ]]; then + echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" + source "${DEFAULT_VENV_PATH}/bin/activate" || exit + # fi else echo "Found existing python at: $(which python3)" fi @@ -447,7 +459,6 @@ function setEnv() { ######################## setup_conda ############################ # ---- [SunSpot @ ALCF] || [Aurora @ ALCF] --------------------- if [[ $(hostname) == x1* || $(hostname) == x4* ]]; then - source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit # ----- [Aurora] -------------------------------------------- if [[ -z "${conda_prefix}" && -z "${virtual_env}" ]]; then if [[ $(hostname) == x4* ]]; then @@ -459,6 +470,7 @@ function setEnv() { setup_conda_sunspot fi fi + source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit # ----- [Polaris @ ALCF] -------------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then From d1aec5d7afe4ca68196e325f32bd185584caaf9a Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 16 May 2024 11:54:55 -0500 Subject: [PATCH 247/268] Update `ALCF/sunspot-env.sh` with new modules for 
`anl_24_q2_release` --- ALCF/sunspot-env.sh | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/ALCF/sunspot-env.sh b/ALCF/sunspot-env.sh index e97545aafe..8b02542b20 100644 --- a/ALCF/sunspot-env.sh +++ b/ALCF/sunspot-env.sh @@ -1,27 +1,8 @@ #!/bin/bash --login - -export CCL_OP_SYNC=1 # Required by current oneCCL (HPCS-8067) -export CCL_PROCESS_LAUNCHER=pmix # Required by Aurora mpich -export FI_PROVIDER=cxi # Required by Aurora mpich -export PALS_PMI=pmix # Required by Aurora mpich -export CCL_ATL_TRANSPORT=mpi # Required by Aurora mpich -export FI_MR_CACHE_MONITOR=disabled # Required by Aurora mpich (HPCS-6501) -export CCL_SKIP_SCHEDULER=1 # Required by current oneCCL, will remove when set by default -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export CCL_TOPO_COLOR="card:{0,1},{2,3},{4,5},{6,7},{8,9},{10,11};plane:{0,3,4,6,8,11},{1,2,5,7,9,10}" -export UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0 # Required by current oneCCL - - -export LLM_DK_DIR=/home/$(whoami)/q4-drop_sunspot/llm.devkit - -module load oneapi/release/2023.12.15.001 -unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE - - +# module use /home/ftartagl/graphics-compute-runtime/modulefiles -module load graphics-compute-runtime/agama-ci-devel-736.9 -source /home/$(whoami)/q4-drop_sunspot/llm.devkit/torch-ccl/third_party/oneCCL/build/_install/env/vars.sh -module load gcc/12.1.0 -module unload intel_compute_runtime/release/agama-devel-647 +module load graphics-compute-runtime/agama-ci-devel-803.29 +module load spack-pe-gcc/0.6.1-23.275.2 +module load gcc/12.2.0 +module use /soft/preview-modulefiles/24.086.0 +module load oneapi/release/2024.04.15.001 From 13700cfc05ef71f6052cf876f8431d5d2eba58bc Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 17 May 2024 07:19:09 -0500 Subject: [PATCH 248/268] Update `ALCF/helpers.sh` with new release on 
Sunspot --- ALCF/helpers.sh | 88 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 18 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 9fe9934dab..a805a34e28 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -229,6 +229,15 @@ function setParams() { # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-320000} # LR DECAY ITERS export LR_DECAY_ITERS=${LR_DECAY_ITERS:-} # LR DECAY ITERS set_lr_args + if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then + TIMING_STR="\ + --timing-log-level ${TIMING_LOG_LEVEL} \ + --log-timers-to-tensorboard \ + --log-optimizer-states-to-tensorboard \ + " + else + TIMING_STR="" + fi } @@ -323,7 +332,10 @@ function setOutput() { OUTPUT_PREFIX="${OUTPUT_PREFIX}_hs${HIDDEN}_mb${MICRO_BATCH}" OUTPUT_PREFIX="${OUTPUT_PREFIX}_seq${SEQ}_gb${GLOBAL_BATCH}" OUTPUT_PREFIX="${OUTPUT_PREFIX}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" - OUTPUT_PREFIX="${OUTPUT_PREFIX}_lr${LR}_lwf${LR_WARMUP_FRAC}_ldi${LR_DECAY_ITERS}" + OUTPUT_PREFIX="${OUTPUT_PREFIX}_lr${LR}_lwf${LR_WARMUP_FRAC}" + if [[ -n "${LR_DECAY_ITERS}" ]]; then + OUTPUT_PREFIX="${OUTPUT_PREFIX}_ldi${LR_DECAY_ITERS}" + fi if [[ -z "${NO_FLASH_ATTN:-}" ]]; then OUTPUT_PREFIX="${OUTPUT_PREFIX}_flash" fi @@ -373,29 +385,39 @@ function sumFiles() { setup_conda_sunspot() { ###### check if CONDA_PREFIX non-empty ################ if [[ -z "${CONDA_PREFIX:-}" ]]; then - eval "$(~/miniconda3/bin/conda shell.zsh hook)" - conda activate anl_24_q2_release + # eval "$(~/miniconda3/bin/conda shell.zsh hook)" + # conda activate anl_24_q2_release + module use /soft/preview-modulefiles/24.086.0 ; module load frameworks/2024.04.15.002.lua fi - # ------------------------------------------------------------------------ - # XXX: Jerome's `frameworks_2024_5_v2` seems broken ?? + # XXX: ------------------------------------------------------------------- + # Jerome's `frameworks_2024_5_v2` seems broken ?? # - seems to be missing `python3 -c 'from mpi4py import MPI'` ??? 
# - consequently, we leave the setup below commented out (for the time # being): - # if [[ -z "${CONDA_PREFIX-}" ]]; then - # module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 - # else - # echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" - # fi - # + # if [[ -z "${CONDA_PREFIX-}" ]]; then + # module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 + # else + # echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + # fi # ------------------------------------------------------------------------ + ###### check if VIRTUAL_ENV non-empty #################################### + # venvs/anl_24_q2_release/bin/activate + # if [[ -d "${DEFAULT_VENV_PATH}" ]]; then if [[ -z "${VIRTUAL_ENV:-}" ]]; then - DEFAULT_VENV_PATH="${WORKING_DIR}/venvs/anl_24_q2_release" - # venvs/anl_24_q2_release/bin/activate - # if [[ -d "${DEFAULT_VENV_PATH}" ]]; then - echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" - source "${DEFAULT_VENV_PATH}/bin/activate" || exit - # fi + if [[ -n "${CONDA_PREFIX}" ]]; then + VENV_DIR="${WORKING_DIR}/venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')" + else + VENV_DIR="${WORKING_DIR}/venvs/anl_24_q2_release" + fi + echo "Caught virtual env at ${VENV_DIR}!" + # source "${VENV_DIR}/bin/activate" || + if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then + printf "[!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")" + # echo "[!ERROR]: Unable to locate ${VENV_DIR}/bin/activate !!" 
+ else + source "${VENV_DIR}/bin/activate" + fi else echo "Found existing python at: $(which python3)" fi @@ -415,6 +437,10 @@ setup_conda_sirius() { fi } +setup_venv_from_conda() { + source "venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')/bin/activate" +} + ######################## # Setup conda on Polaris ######################## @@ -470,7 +496,12 @@ function setEnv() { setup_conda_sunspot fi fi - source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # MPICH_MODULES=$(echo $LOADEDMODULES | tr ':' '\n' | grep mpich) + # if [[ -z "${MPICH_MODULES" ]]; then + # source "${WORKING_DIR}/ALCF/sunspot-env.sh" || exit + # else + # echo "Caught MPICH_MODULES: ${MPICH_MODULES}" + # fi # ----- [Polaris @ ALCF] -------------------------------------------- elif [[ $(hostname) == x3* ]]; then if [[ "${PBS_O_HOST}" == sirius* ]]; then @@ -764,3 +795,24 @@ function printCyan() { function printWhite() { printf "\e[1;37m%s\e[0m\n" "$@" } + +#### [DEPRECATED] ########################################################### +# if [[ -z "${HOSTFILE}" ]]; then +# makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` +# else +# echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" +# fi +# ---------------------------------------------------------------------------- +# setDSlauncher "${HERE}" || exit # 10. 
set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` +# ---------------------------------------------------------------------------- +# TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())') +# printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}" +# if [[ "${TORCH_DEVICE}" == "cuda" ]]; then +# printf %s "Setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" +# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +# fi +# ---------------------------------------------------------------------------- +# export MPICH_GPU_SUPPORT_ENABLED=1 +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO +############################################################################# From 530c7c80cee7ff10a01b432fed01e5e642fc0cd2 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 17 May 2024 07:19:30 -0500 Subject: [PATCH 249/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 65 +++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index d877c19a06..b1d4ed9825 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -5,12 +5,25 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home -if [[ -n "${DEBUG-}" ]]; then +#### Make it easy to track experiments by date ################### +YEAR="$(date "+%Y")" +MONTH="$(date "+%m")" +DAY="$(date "+%Y-%m-%d")" +TODAY="$(date "+%Y-%m-%d")" # kept for backwards compatibility +STARTED_AT="$(date "+%Y-%m-%d-%H%M%S")" +################################################################## + +if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh` printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" 
set -euxo pipefail fi -function sourceFile() { +if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh` + echo "Run NOOP mode" + set -o noexec # same as set -n +fi + +sourceFile() { fp="$1" echo "source-ing ${fp}" if [[ -f "${fp}" ]]; then @@ -21,7 +34,7 @@ function sourceFile() { fi } -# ----[0. Navigate into `$PBS_O_WORKDIR`]------------------------------------- +# ----[0. Navigate into `$PBS_O_WORKDIR`]-------------------------------------- cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') export HERE @@ -47,48 +60,18 @@ printJobInfo || exit # 11. Print job info setupLauncher || exit # 12. set launcher to one of `MPICH` (default), or `deepspeed` # ----------------------------------------------------------------------------- -#### [DEPRECATED] ########################################################### -# if [[ -z "${HOSTFILE}" ]]; then -# makeHostfiles || exit # 4. create `deepspeed` hostfile from `$PBS_NODEFILE` -# else -# echo "!! USING CUSTOM HOSTFILE FROM: ${HOSTFILE}" -# fi -# ---------------------------------------------------------------------------- -# setDSlauncher "${HERE}" || exit # 10. 
set `launcher` args for `deepspeed ${launcher} ${EXEC} ${args}` -# ---------------------------------------------------------------------------- -# TORCH_DEVICE=$(python3 -c 'import ezpz as ez; print(ez.get_torch_device())') -# printf %s "Using TORCH_DEVICE=${TORCH_DEVICE}" -# if [[ "${TORCH_DEVICE}" == "cuda" ]]; then -# printf %s "Setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" -# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True -# fi -# ---------------------------------------------------------------------------- -# export MPICH_GPU_SUPPORT_ENABLED=1 -# export CUDA_DEVICE_MAX_CONNECTIONS=1 -# export NCCL_DEBUG=INFO -############################################################################# - -# Assert TBDIR exists inside our $CKPT_DIR +################################################ +# Assert `$TBDIR` exists inside our `$CKPT_DIR` +# to ensure metrics are tied to checkpoint +################################################ TBDIR="${CKPT_DIR}/tensorboard" mkdir -p "${TBDIR}" -data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" -mkdir -p "${data_cache_path}" -module list -printenv > "${CKPT_DIR}/.env" - -if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then - TIMING_STR="\ - --timing-log-level ${TIMING_LOG_LEVEL} \ - --log-timers-to-tensorboard \ - --log-optimizer-states-to-tensorboard \ - " -else - TIMING_STR="" -fi - - +data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" +# Print info about loaded modules and runtime environment +module list +printenv |& tee "${CKPT_DIR}/.env" # Take custom args custom_args=" $@" From b8cb2e8bfd3459d2f08a6c55b9b07f9f20447ac6 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 17 May 2024 07:51:15 -0500 Subject: [PATCH 250/268] Update `train_aGPT_7B.sh` --- train_aGPT_7B.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index 6bc8f1f9dc..d2719b088e 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -38,4 +38,6 @@ echo 
"+---------------------------------------------------------+" export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 echo "${OUTFILE}" >> "${OUTDIR}/latest" # export DEBUG=1 -MICRO_BATCH="${MBS}" DATA_FILE_LIST="${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" +MICRO_BATCH="${MICRO_BATCH:-${MBS}}" +DATA_FILE_LIST="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt}" +bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 0f180319cc9e48b4c52b32400a26297a91937c40 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 09:43:45 -0500 Subject: [PATCH 251/268] Add `setup_venv_from_conda` fn to `ALCF/helpers.sh` --- ALCF/helpers.sh | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index a805a34e28..9e7e2b582b 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -438,7 +438,38 @@ setup_conda_sirius() { } setup_venv_from_conda() { - source "venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')/bin/activate" + if [[ -z "${CONDA_PREFIX}" ]]; then + echo "No ${CONDA_PREFIX} found." # Exiting." + # exit 1 + else + if [[ -n "${VIRTUAL_ENV}" ]]; then + echo "Already inside virtual env at ${VENV_DIR}!" + elif [[ -z "${VIRTUAL_ENV}" ]]; then + echo "No VIRTUAL_ENV found in environment!" + echo " - Trying to setup from ${CONDA_PREFIX}" + CONDA_NAME=$(echo ${CONDA_PREFIX} | tr '\/' '\t' | sed -E 's/mconda3|\/base//g' | awk '{print $NF}') + VENV_DIR="${WORKING_DIR}/venvs/${CONDA_NAME}" + echo " - Using VENV_DIR=${VENV_DIR}" + # VENV_DIR="venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | sed -E 's/mconda3|\/base//g' | awk '{print $NF}')" + # VENV_DIR="${WORKING_DIR}/venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')" + # VENV_DIR="${WORKING_DIR}/venvs/anl_24_q2_release" + # if [[ -f "${VENV_DIR}/bin/activate" ]]; then + if [[ ! 
-f "${VENV_DIR}/bin/activate" ]]; then + printf "\n - Creating a new virtual env on top of %s in %s" "$(printBlue "${CONDA_NAME}")" "$(printGreen "${VENV_DIR}")" + mkdir -p "${VENV_DIR}" + python3 -m venv "${VENV_DIR}" --system-site-packages + source "${VENV_DIR}/bin/activate" || exit + elif [[ -f "${VENV_DIR}/bin/activate" ]]; then + echo " - Found existing venv, activating from $(printBlue "${VENV_DIR}")" + source "${VENV_DIR}/bin/activate" + else + printf "\n [!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")" + fi + fi + # else + # printf "[!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")" + fi + } ######################## From 061e2cc9e0f4a6c98e857b850f0113dc310f1b09 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 09:44:17 -0500 Subject: [PATCH 252/268] Update `train_aGPT_7B.sh` --- train_aGPT_7B.sh | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index d2719b088e..0907f8163c 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -1,30 +1,34 @@ #!/bin/bash --login # + +NOW="$(date "+%Y-%m-%d-%H%M%S")" +cd "${PBS_O_WORKDIR}" || exit + HOSTNAME=$(hostname) if [[ "${HOSTNAME}" == x3* ]]; then MACHINE="polaris" + # XXX: ¯\_(ツ)_/¯ + # - On Polaris, we see that: + # - on 1 or 2 nodes, only MICRO_BATCH=1 will fit in memory + # - on 8 nodes, MICRO_BATCH=2 will fit in memory + # - on 48 nodes, MICRO_BATCH=4 will fit in memory + # + export nhosts=$(wc -l < "${PBS_NODEFILE}") + if [[ "${nhosts}" == 1 ]]; then + export MBS=1 + elif [[ "${nhosts}" == 2 ]]; then + export MBS=1 + elif [[ "${nhosts}" -ge 3 ]]; then + export MBS=2 + elif [[ "${nhosts}" -ge 8 ]]; then + export MBS=4 + fi elif [[ "${HOSTNAME}" == x1* ]]; then MACHINE="sunspot" elif [[ "${HOSTNAME}" == x4* ]]; then MACHINE="aurora" fi -NOW="$(date "+%Y-%m-%d-%H%M%S")" -cd "${PBS_O_WORKDIR}" || exit -export 
nhosts=$(wc -l < "${PBS_NODEFILE}") - -if [[ "${nhosts}" == 1 ]]; then - export MBS=1 -elif [[ "${nhosts}" == 2 ]]; then - export MBS=1 -elif [[ "${nhosts}" -ge 2 ]]; then - export MBS=2 -elif [[ "${nhosts}" -ge 8 ]]; then - export MBS=4 -fi - -# printf "Detected %s hosts. Running with micro_batch:\n" ${nhosts} ${MBS} - OUTDIR="${PBS_O_WORKDIR}/pbslogs" mkdir -p "${OUTDIR}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" @@ -38,6 +42,6 @@ echo "+---------------------------------------------------------+" export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 echo "${OUTFILE}" >> "${OUTDIR}/latest" # export DEBUG=1 -MICRO_BATCH="${MICRO_BATCH:-${MBS}}" -DATA_FILE_LIST="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt}" +export MICRO_BATCH="${MICRO_BATCH:-${MBS}}" +export DATA_FILE_LIST="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 47bf9b50088069f960a9ee52d4d535530427d8ae Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 09:44:34 -0500 Subject: [PATCH 253/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index b1d4ed9825..fee2e392dc 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -71,7 +71,9 @@ data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path} # Print info about loaded modules and runtime environment module list -printenv |& tee "${CKPT_DIR}/.env" +dotenv_file="${CKPT_DIR}/.env" +echo "Saving environment to ${dotenv_file}" +printenv | grep -v "LS_COLORS" > "${dotenv_file}" # Take custom args custom_args=" $@" From e68d270faf4780018ae954f38e83b002adadb55f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 09:47:57 -0500 Subject: [PATCH 254/268] Update `ALCF/README.md` --- ALCF/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 
deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index d405986fdf..8c097c699a 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,4 +1,4 @@ -# Megatron-DeepSpeed @ ALCF +- [>] # Megatron-DeepSpeed @ ALCF ## ✅ TODOs @@ -82,13 +82,16 @@ To launch: +### Polaris + ```bash $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I $ cd /path/to/Megatron-DeepSpeed/ -# load your favorite {conda, venv} environment, requires: {pytorch, deepspeed} -# e.g. on Polaris: $ module load conda/2023-10-04 # ; conda activate cu118-pt221 ; unset PYTHONUSERBASE -$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./convergence_debug_small.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh +$ module use /soft/modulefiles +$ module load conda/2024-04-29 +$ conda activate base +$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh ```
[output]: From ac414a0479b69f4968405c682cd49700eb3faebb Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 09:57:28 -0500 Subject: [PATCH 255/268] Update README.md --- ALCF/README.md | 60 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 8c097c699a..0b436792d9 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,4 +1,4 @@ -- [>] # Megatron-DeepSpeed @ ALCF +# Megatron-DeepSpeed @ ALCF ## ✅ TODOs @@ -80,19 +80,35 @@ > [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching > distributed training on {Polaris, Aurora, Sunspot} @ ALCF. -To launch: - -### Polaris - -```bash -$ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I -$ cd /path/to/Megatron-DeepSpeed/ -$ module load conda/2023-10-04 # ; conda activate cu118-pt221 ; unset PYTHONUSERBASE -$ module use /soft/modulefiles -$ module load conda/2024-04-29 -$ conda activate base -$ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh -``` +To launch on Polaris @ ALCF: + +> [!IMPORTANT] +> **Launch Instructions** on Polaris @ ALCF +> +> 1. Request an interactive job with `qsub -I`: +> +> ```bash +> $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I +> ``` +> +> 2. Clone repo + navigate into it: +> ```sh +> $ cd /path/to/Megatron-DeepSpeed/ # or git clone https://github.com/argonne-lcf/Megatron-DeepSpeed +> ``` +> +> 3. Load required modules: +> +> ```sh +> $ module use /soft/modulefiles +> $ module load conda/2024-04-29 +> $ conda activate base +> ``` +> +> 4. Launch: +> +> ```sh +> $ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh +> ```
[output]: @@ -461,6 +477,8 @@ training ...
+ + + + ## 📝 Data Preprocessing
Data Pre-Processing: From 9aa7fabfc70ead7470183c31e9ea5c7449d6da08 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Mon, 20 May 2024 13:26:09 -0500 Subject: [PATCH 256/268] Fix path in `prof.export_chrome_trace()` from `pretrain_gpt_alcf.py` --- pretrain_gpt_alcf.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py index 750bd21e48..94186fea83 100644 --- a/pretrain_gpt_alcf.py +++ b/pretrain_gpt_alcf.py @@ -2,6 +2,7 @@ """Pretrain GPT""" +from pathlib import Path from mpi4py import MPI import os from rich import print @@ -581,9 +582,13 @@ def main(): data_post_process=data_post_process ) args = get_args() - prof.export_chrome_trace( - f"{args.tensorboard_dir}" - "/torch-trace-{RANK}-of-{WORLD_SIZE}.json" + assert args is not None + trace_output = Path(f"{args.tensorboard_dir}").joinpath( + f"torch-trace-{RANK}-of-{WORLD_SIZE}.json" + ) + prof.export_chrome_trace(trace_output.as_posix()) + log.info( + f'Saved trace output to: {trace_output.as_posix()}' ) else: model = pretrain( From 2f0154394bbdf3c64b4669f9d944645e2cdb8f2b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Thu, 23 May 2024 17:49:32 -0500 Subject: [PATCH 257/268] Update README.md --- ALCF/README.md | 251 +++++++++++++++++++++++++++++++------------------ 1 file changed, 162 insertions(+), 89 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 0b436792d9..82577ab372 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -1,114 +1,84 @@ # Megatron-DeepSpeed @ ALCF -## ✅ TODOs -
-TODOs: +## 🆘 Getting Started -- [ ] Ensure / double check that optimizer settings from `ds_config.json` aren't being overwritten by some defaults in `megatron/arguments.py` - - [ ] specifically, `momentum, beta{1, 2}, etc` - -
Completed +> [!NOTE] +> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching +> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. -- Continue runs on Polaris @ - - [x] 48 Nodes - - [x] 32 Nodes - - [x] 16 Nodes - - [x] 8 Nodes - - [x] 4 Nodes -- [x] Then, try re-creating ( / fixing) conda with `cuda==12.1` - - 😔, failed. - -- ~~‼️ Unable to save checkpoints with `torch==2.1` + `cuda==11.8`~~: - - Fixed in [a57a21f](https://github.com/argonne-lcf/Megatron-DeepSpeed/commit/a57a21f6b2a8abf847f5ef599e1b1edcb5a5e1b5) + -
🐛 Bug - - - Training progresses OK: +## 🏃‍♂️ Running - ```bash - [2024-03-07 15:27:02,646] [INFO] [timer.py:260:stop] epoch=0/micro_step=199/global_step=199, RunningAvgSamplesPerSec=58.730622229657506, CurrSamplesPerSec=61.35304005128382, MemAllocated=6.01GB, MaxMemAllocated=19.52GB - iteration 199/ 317892 | consumed samples: 152832 | consumed tokens: 625999872 | elapsed time per iteration (ms): 14287.5 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.905366E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 53.753 | tokens per gpu per second (tgs): 1146.733 | TFLOPs: 69.85 | - [2024-03-07 15:27:15,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.000240653265864008, 0.000240653265864008], mom=[(0.9, 0.999), (0.9, 0.999)] - [2024-03-07 15:27:17,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=58.730745476291396, CurrSamplesPerSec=58.75503515561452, MemAllocated=6.01GB, MaxMemAllocated=19.52GB - iteration 200/ 317892 | consumed samples: 153600 | consumed tokens: 629145600 | elapsed time per iteration (ms): 14541.4 | learning rate: 2.407E-04 | global batch size: 768 | lm loss: 5.897035E+00 | loss scale: 8192.0 | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 52.815 | tokens per gpu per second (tgs): 1126.713 | TFLOPs: 68.63 | - saving checkpoint at iteration 200 to checkpoints/ds_stage2_nl32_hs4096_mb8_seq4096_gb768_pp1_tp2_fp16 - # ... 
- ``` +To launch on Polaris @ ALCF: - - Then crashes with: - ```python - Traceback (most recent call last): - Traceback (most recent call last): - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 575, in - model = main() - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/pretrain_gpt_alcf.py", line 554, in main - model = pretrain( - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 226, in pretrain - iteration = train(forward_step_func, - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1290, in train - save_checkpoint_and_time(iteration, model, optimizer, - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/training.py", line 1151, in save_checkpoint_and_time - save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 259, in save_checkpoint - state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/checkpointing.py", line 783, in _universal_checkpoint_info - info.update(model[0].universal_checkpoint_info()) - File "/lus/eagle/projects/datascience/foremans/tmp/Megatron-DeepSpeed/megatron/model/gpt_model.py", line 203, in universal_checkpoint_info - info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns() - File "/lus/eagle/projects/datascience/foremans/miniconda3/envs/polaris/2024-03-06/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__ - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - AttributeError: 'GPTModel' object has no attribute '_get_tp_replicated_param_patterns' - ``` - 🤔 +
⏳ Request an interactive job with qsub -I: + +```bash +qsub -A <project> -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I +```
+
⬇️ Clone repo + navigate into it: -
+```bash +git clone "https://github.com/argonne-lcf/Megatron-DeepSpeed" +cd Megatron-DeepSpeed +```
+
🐍 Setup Python: + +```bash +module use /soft/modulefiles ; module load conda ; conda activate base +PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_venv_from_conda +``` + +- 🍋 Install [`ezpz`](https://github.com/saforem2/ezpz): + + ```bash + mkdir deps && git clone https://github.com/saforem2/ezpz deps/ezpz + python3 -m pip install -e deps/ezpz --require-virtualenv + ``` +
-## 🏃‍♂️ Running -> [!NOTE] -> [`train_llama_alcf.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/train_llama_alcf.sh) is the main entry point for launching -> distributed training on {Polaris, Aurora, Sunspot} @ ALCF. +
🚀 Launch: -To launch on Polaris @ ALCF: +In this case, train a ~ 2B Model (with 10 layers), +for 1000 iterations using the data file list in: -> [!IMPORTANT] -> **Launch Instructions** on Polaris @ ALCF -> -> 1. Request an interactive job with `qsub -I`: -> -> ```bash -> $ qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I -> ``` -> -> 2. Clone repo + navigate into it: -> ```sh -> $ cd /path/to/Megatron-DeepSpeed/ # or git clone https://github.com/argonne-lcf/Megatron-DeepSpeed -> ``` -> -> 3. Load required modules: -> -> ```sh -> $ module use /soft/modulefiles -> $ module load conda/2024-04-29 -> $ conda activate base -> ``` -> -> 4. Launch: -> -> ```sh -> $ export PBS_O_WORKDIR="$(pwd)" && DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt DTYPE=bf16 OPT=adamw bash train_llama_alcf.sh -> ``` +[`ALCF/data-lists/polaris/books.txt`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/data-lists/polaris/books.txt) + +with a micro-batch-size of 2, with the `torch.optim.AdamW` optimizer. + +**Note** that _any_ of the options in the + +[`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140) + +function from + +[`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh) + +can be overridden dynamically at runtime using this technique. + +```bash +PBS_O_WORKDIR=$(pwd) DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt TRAIN_ITER=1000 NLAYERS=10 MICRO_BATCH=2 OPT=adamw bash train_llama_alcf.sh +```
[output]: @@ -477,6 +447,34 @@ training ...
+
+ + + + + + +
🚀 Launch: @@ -66,14 +100,8 @@ for 1000 iterations using the data file list in: with a micro-batch-size of 2, with the `torch.optim.AdamW` optimizer. -**Note** that _any_ of the options in the - -[`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140) - -function from - -[`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh) - +**Note** that _any_ of the options in the [`setParams`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/main/ALCF/helpers.sh#L140) +function from [`ALCF/helpers.sh`](https://github.com/argonne-lcf/Megatron-DeepSpeed/blob/7d203596dbf14e048e756c5ee6705de7dcb22283/ALCF/helpers.sh) can be overridden dynamically at runtime using this technique. ```bash From b37174223577f1b6dcc3085f58df77424e3bc222 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 06:23:09 -0500 Subject: [PATCH 259/268] Add `setup_tokenizer_and_data()` function to `ALCF/helpers.sh` --- ALCF/helpers.sh | 145 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 7 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 9e7e2b582b..6a647952ed 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -16,6 +16,22 @@ export WORKING_DIR="${WORKING_DIR}" printf "Using WORKING_DIR: %s\n" ${WORKING_DIR} +save_dotenv() { + if [[ "$#" -ne 1 ]]; then + estr="[error]" + # echo "Expected exactly one argument, specifying outputdir. Received $#" + printf "%s Expected one argument (outdir). 
Received: %s" "$(printRed ${estr})" "$#" + else + outdir="$1" + module list + dotenv_file="${outdir}/.env" + echo "Saving environment to ${dotenv_file}" + printenv | grep -v "LS_COLORS" > "${dotenv_file}" + export DOTENV_FILE="${dotenv_file}" + fi +} + + function get_machine() { if [[ $(hostname) == x4* ]]; then machine="aurora" @@ -100,14 +116,14 @@ function loadCondaEnv() { function setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then - export LAUNCH_CMD="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" + export LAUNCHER="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" else # Assert `./hostfile_deepspeed` exists export hfds="${WORKING_DIR}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit - export LAUNCH_CMD="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" + export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" fi - printf "%s" "$(printRed 'Launching with:')" - printf " %s" "$(printMagenta ${LAUNCH_CMD})" + printf "Launching with: %s\n" "$(printRed "${LAUNCH_CMD}")" + printf " %s" "$(printMagenta ${LAUNCHER})" } function setDSlauncher() { @@ -219,8 +235,8 @@ function setParams() { export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1} # USE ACTIVATION CHECKPOINTING ? 
export GLOBAL_BATCH_MAX=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) # MAX GLOBAL BATCH SIZE export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}" # WILL USE MAX IF NOT SET IN ENVIRONMENT - tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model - export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + # tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ export MODEL_TYPE="llama-seq${SEQ}-pp${PP}-tp${TP}-${NLAYERS}layers-${HEADS}heads-${HIDDEN}hidden" # STRING FOR IDENTIFYING MODEL # +----[ADDITIONAL LLAMA SPECIFIC ARGUMENTS]------------------------------ export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear" @@ -498,6 +514,23 @@ setup_conda_polaris() { fi } +########################################################## +# Check that we can find the `.py` file we wish to launch +########################################################## +check_executable() { + fp=$1 + if [[ -f "${fp}" ]]; then + export EXEC="${EXEC}" + # ----[1.5 Keep track of stem from file path]------------------------- + exec_stem=$(echo "${EXEC}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.py//g") + export EXEC_STEM="${exec_stem}" + else + estr="Unable to locate executable ${fp}" + printf "[ALCF.helpers:check_executable] %s" "$(printRed ${estr})" + fi +} + + function setEnv() { local virtual_env="${VIRTUAL_ENV-}" @@ -578,17 +611,109 @@ function makeHostfiles() { fi } +################################################## +# Setup tokenizer as either Llama2 or GPT2 style +################################################## +setup_tokenizer_and_data() { + # if [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == 
LLAMA* ]]; then + # tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + # export TOKENIZER_TYPE="Llama2" + # setData + # elif [[ ${tok} == gpt* || ${tok} == GPT* ]]; then + # export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + # export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" + # export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" + # export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" + # export TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" + # # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}" + # export TOKENIZER_TYPE="GPT2" + # else + # echo "Unknown tokenizer ${tok} passed" + # fi + if [[ "$#" == 1 ]]; then + tok="$1" + dfl="${DATA_FILE_LIST:-}" + # echo "Setting up tokenizer with ${tok}" + # elif [[ "$#" -ne 2 ]]; then + elif [[ "$#" == 2 ]]; then + tok="$1" + dfl="$2" + # tok="${TOKENIZER_TYPE:-Llama2}" + else + echo "Incorrect number of arguments passed. 
Received: $#, expected 2" + fi + echo "Setting up tokenizer with ${tok}" + echo "Using data_file_list: ${dfl}" + if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then + export TOKENIZER_TYPE="GPT2" + export TOKENIZER_FLAGS="--tokenizer-type GPT2BPETokenizer" + export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" + export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" + export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" + # TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file ${MERGE_FILE}" + export DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}" + # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}" + # else [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == LLAMA* ]]; then + else + export DATA_FLAGS="" + export TOKENIZER_TYPE="Llama2" + tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + export TOKENIZER_FLAGS="${TOKENIZER_FLAGS} --tokenizer-type Llama2Tokenizer --tokenizer-model ${TOKENIZER_MODEL}" + if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then + echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}" + setData "${dfl}" || exit + # setData "${DATA_FILE_LIST-}" || exit # 09. Specify `DATA_FILE_LIST` for dolma dataset + fi + # --tokenizer-model ${TOKENIZER_MODEL} \ + fi + # export DATA_FLAGS="${DATA_FLAGS}" + # export TOKENIZER_TYPE="${TOKENIZER_TYPE}" + # export TOKENIZER_FLAGS="${TOKENIZER_FLAGS}" + printf "[setData] DATA_FLAGS: %s\n" "$(printGreen ${DATA_FLAGS})" + printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta ${TOKENIZER_FLAGS})" + # if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then + # echo "Using tokenizer: ${TOKENIZER_TYPE}. 
Setting up data with ${DATA_FILE_LIST-}" + # setData "${DATA_FILE_LIST-}" || exit # 09. Specify `DATA_FILE_LIST` for dolma dataset + # fi +} + + ############################################### # `setData`: # Ensure `DATA_FILE_LIST` is set, # fallback to default values if necessary. ############################################### function setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] + # if [[ "$#" -ne 1 ]]; then + # tok="${TOKENIZER_TYPE:-Llama2}" + # else + # tok="$1" + # fi + # echo "Setting up tokenizer with ${tok}" + # setup_tokenizer "${tok}" + # tok="${TOKENIZER_TYPE:-}" + # if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then + # export TOKENIZER_TYPE="GPT2" + # export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + # export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" + # export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" + # export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" + # # TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file ${MERGE_FILE}" + # DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}" + # # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}" + # # else [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == LLAMA* ]]; then + # else + # export TOKENIZER_TYPE="Llama2" + # tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model + # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ + # TOKENIZER_FLAGS="${TOKENIZER_FLAGS} --tokenizer-type Llama2Tokenizer" # dfldir="${WORKING_DIR}/ALCF/data-lists" # =====[Set DATA_FILE_LIST_FALLBACK based on current machine]============== if [[ $(hostname) == x4* ]]; then # -----------------------------[AURORA] dfl_fallback="/home/foremans/anl_24_release_q4/llm.devkit/Megatron-DeepSpeed/data_file_list_reweighted.txt" - elif [[ $(hostname) == x1* ]]; then # 
----------------------------[SUNSPOT] # shellcheck: source ./data-lists/sunspot/books.txt dfl_fallback="${WORKING_DIR}/ALCF/data-lists/sunspot/books.txt" @@ -623,6 +748,7 @@ function setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] export WEIGHT_SUM="${ws}" export DFL_STEM="${dfl_stem}" export DATA_CACHE_PATH="${dcp}" + export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST} --data-cache-path ${DATA_CACHE_PATH}" echo "--------------------" echo "Updated environment:" printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" @@ -631,6 +757,11 @@ function setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] printf "DFL_STEM: %s\n" "${DFL_STEM}" printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" echo "--------------------" + # fi + # export DATA_FLAGS="${DATA_FLAGS}" + # export TOKENIZER_FLAGS="${TOKENIZER_FLAGS}" + # printf "[setData] DATA_FLAGS: %s\n" "$(printGreen ${DATA_FLAGS})" + # printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta ${TOKENIZER_FLAGS})" } function generateDSconfig() { From d93fb7f41eeda59a3c90f90984979da4db4212bf Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 06:23:22 -0500 Subject: [PATCH 260/268] Update `train_llama_alcf.sh` --- train_llama_alcf.sh | 83 +++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index fee2e392dc..9a6d39923c 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -5,14 +5,26 @@ #PBS -l select=48 #PBS -l filesystems=eagle:home + #### Make it easy to track experiments by date ################### -YEAR="$(date "+%Y")" -MONTH="$(date "+%m")" -DAY="$(date "+%Y-%m-%d")" -TODAY="$(date "+%Y-%m-%d")" # kept for backwards compatibility -STARTED_AT="$(date "+%Y-%m-%d-%H%M%S")" +year="$(date "+%Y")" +month="$(date "+%m")" +day="$(date "+%Y-%m-%d")" +today="$(date "+%Y-%m-%d")" # kept for backwards compatibility +started_at="$(date "+%Y-%m-%d-%H%M%S")" +export 
YEAR="${year}" +export MONTH="${month}" +export DAY="${day}" +export TODAY="${today}" +export STARTED_AT="${started_at}" ################################################################## + +############################################################################# +# Check if running in `DEBUG=1` mode. +# - If so, this will print each command before it is run and exit if any of +# them return a nonzero exit status. +############################################################################# if [[ -n "${DEBUG-}" ]]; then # to use: `DEBUG=1 bash train_llama_alcf.sh` printf "\e[1;31m%s\e[0m\n" "!! RUNNING IN DEBUG MODE !!" set -euxo pipefail @@ -23,6 +35,9 @@ if [[ -v NOOP ]]; then # to use: `NOOP=1 bash train_llama_alcf.sh` set -o noexec # same as set -n fi +################################################## +# Helper function for `source`-ing another file +################################################## sourceFile() { fp="$1" echo "source-ing ${fp}" @@ -34,6 +49,8 @@ sourceFile() { fi } +############################################################################## +###################### MAIN LOGIC ############################################ # ----[0. Navigate into `$PBS_O_WORKDIR`]-------------------------------------- cd "${PBS_O_WORKDIR}" || exit HERE=$(python3 -c 'import os; print(os.getcwd())') @@ -43,6 +60,11 @@ export HERE export EXEC="${HERE}/pretrain_gpt_alcf.py" [ -f "${EXEC}" ] || exit +# ----[1.5 Keep track of stem from file path] +exec_stem=$(echo "${EXEC}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.py//g") +export EXEC_STEM="${exec_stem}" + + # ----[2. `source ./ALCF/helpers_alcf.sh`:]------------------------------------ sourceFile "${HERE}/ALCF/helpers.sh" || exit @@ -55,10 +77,15 @@ setParams || exit # 05. Set command line arguments to pass t buildDSconfig || exit # 06. Create `deepspeed_config.json` from runtime params from ^ setOutput || exit # 07. Specify output directory for {logs, checkpoints, etc.} setArgs || exit # 08. 
Specify additional `deepspeed` arguments -setData "${DATA_FILE_LIST-}" || exit # 09. Specify `DATA_FILE_LIST` for dolma dataset -printJobInfo || exit # 11. Print job info -setupLauncher || exit # 12. set launcher to one of `MPICH` (default), or `deepspeed` +dfl="${DATA_FILE_LIST:-}" # 09. Setup data + tokenizer +tok="${TOKENIZER_TYPE:-Llama2}" # via `DATA_FILE_LIST` and `TOKENIZER_TYPE` +setup_tokenizer_and_data "${tok}" "${dfl}" || exit +printJobInfo || exit # 10. Print job info +setupLauncher || exit # 11. set launcher to one of `MPICH` (default), or `deepspeed` +save_dotenv "${CKPT_DIR}" || exit # 12. Print info about loaded modules and runtime environment +check_and_kill_if_running || exit # 13. Check that we're not already running, if so, exit. # ----------------------------------------------------------------------------- +############################################################################## ################################################ # Assert `$TBDIR` exists inside our `$CKPT_DIR` @@ -69,31 +96,33 @@ mkdir -p "${TBDIR}" data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" -# Print info about loaded modules and runtime environment -module list -dotenv_file="${CKPT_DIR}/.env" -echo "Saving environment to ${dotenv_file}" -printenv | grep -v "LS_COLORS" > "${dotenv_file}" - -# Take custom args -custom_args=" $@" - # --log-num-zeros-in-grad \ - # --log-memory-to-tensorboard \ -run_cmd=" - ${LAUNCH_CMD} \ - --${DTYPE} \ +export DEFAULTS="\ --split 100,0,0 \ --log-interval 1 \ --no-bias-gelu-fusion \ --no-bias-dropout-fusion \ --no-masked-softmax-fusion \ - --tokenizer-type Llama2Tokenizer \ --no-gradient-accumulation-fusion \ --accumulate-allreduce-grads-in-fp32 \ --use-checkpoint-opt_param-scheduler \ --log-timers-to-tensorboard \ - --log-optimizer-states-to-tensorboard \ + --log-optimizer-states-to-tensorboard" + +# Take custom args +custom_args=" $@" + + # --log-num-zeros-in-grad \ + # --log-memory-to-tensorboard \ + # 
--data-file-list ${DATA_FILE_LIST} \ + # --data-file-list ${DATA_FILE_LIST} \ + # --data-cache-path ${data_cache_path} \ + # --tokenizer-type Llama2Tokenizer \ + # --tokenizer-model ${TOKENIZER_MODEL} \ +run_cmd=" + ${LAUNCHER} \ + --${DTYPE} \ + ${DEFAULTS} \ --optimizer ${OPT} \ --save ${CKPT_DIR} \ --load ${CKPT_DIR} \ @@ -109,27 +138,23 @@ run_cmd=" --eval-interval ${EVAL_INTERVAL} \ --max-position-embeddings ${SEQ} \ --micro-batch-size ${MICRO_BATCH} \ - --data-file-list ${DATA_FILE_LIST} \ --tensor-model-parallel-size ${TP} \ --global-batch-size ${GLOBAL_BATCH} \ --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ - --data-cache-path ${data_cache_path} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ - --tokenizer-model ${TOKENIZER_MODEL} \ ${LR_ARGS} \ ${LLAMA_ARGS} \ ${TIMING_STR} \ + ${DATA_FLAGS} \ + ${TOKENIZER_FLAGS} \ $ds_args \ ${gpt_args[*]} \ $custom_args \ |& tee ${OUTPUT_LOG} " -check_and_kill_if_running || exit echo "${run_cmd}" printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow ${OUTPUT_LOG})" -# printf "[!! 
\e[1;31m%s\e[0m] View output at:\n" "NOTE" -# printf "\e[1;34m%s\e[0m\n" "${OUTPUT_LOG}" eval "${run_cmd}" set +x From 05d82c3911f2643d55ef3da5f497c9bfba70313b Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 06:23:56 -0500 Subject: [PATCH 261/268] Update `train_aGPT_7B.sh` --- train_aGPT_7B.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index 0907f8163c..e5035ae27b 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -7,13 +7,12 @@ cd "${PBS_O_WORKDIR}" || exit HOSTNAME=$(hostname) if [[ "${HOSTNAME}" == x3* ]]; then MACHINE="polaris" - # XXX: ¯\_(ツ)_/¯ + # XXX: # - On Polaris, we see that: # - on 1 or 2 nodes, only MICRO_BATCH=1 will fit in memory # - on 8 nodes, MICRO_BATCH=2 will fit in memory # - on 48 nodes, MICRO_BATCH=4 will fit in memory - # - export nhosts=$(wc -l < "${PBS_NODEFILE}") + nhosts=$(wc -l < "${PBS_NODEFILE}") if [[ "${nhosts}" == 1 ]]; then export MBS=1 elif [[ "${nhosts}" == 2 ]]; then @@ -29,6 +28,7 @@ elif [[ "${HOSTNAME}" == x4* ]]; then MACHINE="aurora" fi +export nhosts OUTDIR="${PBS_O_WORKDIR}/pbslogs" mkdir -p "${OUTDIR}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" From 6de8496e435fdf9a2c32ac5b627afecc62f1aa32 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 06:24:21 -0500 Subject: [PATCH 262/268] Update `ALCF/README.md` --- ALCF/README.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ALCF/README.md b/ALCF/README.md index 29169932d2..907cbd36cc 100644 --- a/ALCF/README.md +++ b/ALCF/README.md @@ -19,10 +19,10 @@ ## 🏃‍♂️ Running -To launch on Polaris @ [ALCF](https://alcf.anl.gov): +To launch on {`Polaris`, `Sunspot`} @ [ALCF](https://alcf.anl.gov):
⏳ Request an interactive job with qsub -I: - + ```bash qsub -A -q debug -l select=2 -l walltime=01:00:00,filesystems=eagle:home -I ``` @@ -42,14 +42,22 @@ cd Megatron-DeepSpeed 1. 📂 Load `conda` module and activate base environment: - ```bash - module use /soft/modulefiles ; module load conda ; conda activate base - ``` + - **Polaris**: + + ```bash + module use /soft/modulefiles ; module load conda ; conda activate base + ``` + + - **Sunspot**: + + ```bash + source ALCF/sunspot-env-2024-04-15-002.sh + ``` 3. 👻 Create virtual environment _on top of the base `conda`_[^venv]: ```bash - PBS_O_WORKDIR=$(pwd) source ALCF/helpers.sh && setup_venv_from_conda + export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup_venv_from_conda ``` From 03aa7c11c0d5afd5c516b52b341b078c04c594fe Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 08:22:23 -0500 Subject: [PATCH 263/268] Update `ALCF/helpers.sh` --- ALCF/helpers.sh | 265 ++++++++++++++++++++++++------------------------ 1 file changed, 130 insertions(+), 135 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index 6a647952ed..a9af22416d 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -23,6 +23,7 @@ save_dotenv() { printf "%s Expected one argument (outdir). 
Received: %s" "$(printRed ${estr})" "$#" else outdir="$1" + mkdir -p "${outdir}" module list dotenv_file="${outdir}/.env" echo "Saving environment to ${dotenv_file}" @@ -32,7 +33,26 @@ save_dotenv() { } -function get_machine() { +where_am_i() { + if [[ $(hostname) == x4* ]]; then + machine="aurora" + elif [[ $(hostname) == x1* ]]; then + machine="sunspot" + elif [[ $(hostname) == x3* ]]; then + if [[ "${PBS_O_HOST}" == sirius* ]]; then + machine="sirius" + else + machine="polaris" + fi + elif [[ $(hostname) == nid* ]]; then + machine="perlmutter" + else + machine=$(hostname) + fi + echo "${machine}" +} + +get_machine() { if [[ $(hostname) == x4* ]]; then machine="aurora" elif [[ $(hostname) == x1* ]]; then @@ -53,7 +73,7 @@ function get_machine() { } -function check_and_kill_if_running() { +check_and_kill_if_running() { # kill $(ps aux | grep -E "$USER.+(mpi|main.py)" | grep -v grep | awk '{print $2}') RUNNING_PIDS=$(lsof -i:29500 -Fp | head -n 1 | sed 's/^p//') if [[ -n "${RUNNING_PIDS}" ]]; @@ -64,7 +84,7 @@ function check_and_kill_if_running() { } -function setupSrun() { +setupSrun() { if [[ $(hostname) == login* || $(hostname) == nid* ]]; then export NHOSTS="${SLURM_NNODES:-1}" export NGPU_PER_HOST="${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}" @@ -76,7 +96,7 @@ function setupSrun() { } -function printJobInfo() { +printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" echo "- MPICH_DIR=${MPICH_DIR:-${MPI_ROOT}}" echo "- Using $(which python3)" @@ -87,7 +107,7 @@ function printJobInfo() { echo "++++++++++++++++++++++++++++++++++++++++++++++++++" } -function setupVenv() { +setupVenv() { VENV_DIR="$1" if [[ -d "${VENV_DIR}" ]]; then echo "Found venv at: ${VENV_DIR}" @@ -97,7 +117,7 @@ function setupVenv() { fi } -function loadCondaEnv() { +loadCondaEnv() { if [[ "${CONDA_EXE}" ]]; then echo "Already inside ${CONDA_EXE}, exiting!" 
else @@ -113,7 +133,7 @@ function loadCondaEnv() { } -function setupLauncher() { +setupLauncher() { # outdir=$1 if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then export LAUNCHER="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" @@ -126,7 +146,7 @@ function setupLauncher() { printf " %s" "$(printMagenta ${LAUNCHER})" } -function setDSlauncher() { +setDSlauncher() { # launcher setting outdir=$1 export hfds="$outdir/hostfile_deepspeed" @@ -141,7 +161,7 @@ function setDSlauncher() { fi } -function set_lr_args() { +set_lr_args() { LR_ARGS="--lr ${LR} --lr-decay-style cosine" if [[ -n "${LR_DECAY_ITERS:-}" ]]; then LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}" @@ -153,7 +173,22 @@ function set_lr_args() { export LR_ARGS="${LR_ARGS}" } -function setParams() { + +get_batch_size_on_polaris() { + if [[ $(hostname) == x3* ]]; then + local nhosts=$(wc -l < "${PBS_NODEFILE}") + if [[ "${nhosts}" == 1 || "${nhosts}" == 2 ]]; then + mbs=1 + elif [[ "${nhosts}" -ge 3 ]]; then + mbs=2 + elif [[ "${nhosts}" -ge 8 ]]; then + mbs=4 + fi + fi + echo "${mbs}" +} + +setParams() { LLAMA_ARGS="" # +----[Parallelism Settings] -------------------------------------------+ # +------[Aurora]--------||-------[SunSpot]-------------+ @@ -182,7 +217,8 @@ function setParams() { export DTYPE=${DTYPE:-fp16} # DTYPE: FP16 export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-8} # GRADIENT_ACC_STEPS # NOTE: MICRO_BATCH is exported below - MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 + # MICRO_BATCH=${MICRO_BATCH:-2} # MICRO_BATCH = 8 + export MICRO_BATCH="${MICRO_BATCH:-$(get_batch_size_on_polaris)}" if [[ -n "${NO_FLASH_ATTN-}" ]]; then echo "Not using flash-attn!!" 
else @@ -257,7 +293,7 @@ function setParams() { } -function setArgs() { +setArgs() { # ---- Set DeepSpeed arguments -------------------------------- ds_args=" " ds_args=" --deepspeed ${ds_args}" @@ -288,7 +324,7 @@ function setArgs() { } -function make_ds_hostfile() { +make_ds_hostfile() { export GPUS_PER_NODE="${GPUS_PER_NODE:-${NGPU_PER_HOST:-${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}}}" # ---- Make MPICH hostfile ---------------- hf="${HOSTFILE:-${PBS_NODEFILE}}" @@ -304,33 +340,45 @@ function make_ds_hostfile() { # | 1. Git clone ezpz (if not found) | # | 2. Install ezpz (if not installed) | # +---------------------------------------+ -function ezpz() { +ezpz() { if [[ ! -d "${WORKING_DIR}/deps/ezpz" ]]; then mkdir -p "${WORKING_DIR}/deps" git clone https://github.com/saforem2/ezpz "${WORKING_DIR}/deps/ezpz" else echo "Found ezpz!" fi - echo "Done with clone. Now, checking if ezpz is installed..." - # if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then - if python3 -c "import sys; any(['ezpz' in s for s in sys.path])" 2> '/dev/null'; then - echo "Has ezpz installed. Nothing to do." + ezloc=$(python3 -m pip list | grep ezpz | awk '{print $NF}') + if [[ -n "${ezloc}" ]]; then + echo "ezpz detected. Sourcing ${ezloc}/bin/savejobenv" + source "${ezloc}/src/ezpz/bin/savejobenv" > /dev/null 2>&1 + source "${ezloc}/src/ezpz/bin/getjobenv" || exit + make_ds_hostfile || exit else - echo "Does not have ezpz installed. Installing..." - echo "Using $(which python3) to install ezpz:" - python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" # > ezpz-install.log 2>&1 + echo "No ezpz detected. Attempting to install with $(which python3)" + python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" --require-virtualenv fi echo "Done with ezpz." 
- source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv > /dev/null 2>&1 #> /tmp/savejobenv.log 2>&1 || exit - source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv || exit - make_ds_hostfile || exit + # echo "Done with clone. Now, checking if ezpz is installed..." + # if python3 -c 'import ezpz; print(ezpz.__file__)' 2> '/dev/null'; then + # if [[ $(python3 -c "import sys; any(['ezpz' in s for s in sys.path])") 2> '/dev/null' ]]; then + # echo "Has ezpz installed. Nothing to do." + # else + # echo "Does not have ezpz installed. Installing..." + # echo "Using $(which python3) to install ezpz:" + # python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" --verbose --require-virtualenv # > ezpz-install.log 2>&1 + # fi + # python3 -m pip install -e "${WORKING_DIR}/deps/ezpz" --verbose --require-virtualenv + # # echo "Done with ezpz." + # source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/savejobenv > /dev/null 2>&1 #> /tmp/savejobenv.log 2>&1 || exit + # source ${WORKING_DIR}/deps/ezpz/src/ezpz/bin/getjobenv || exit + # make_ds_hostfile || exit } # +------------------------------------------------------------------------+ # | Save important environment variables to .deepspeed_env, which will be | # | forwarded to ALL ranks with DeepSpeed | # +------------------------------------------------------------------------+ -function saveDSenv() { +saveDSenv() { echo "Saving {PATH, LD_LIBRARY_PATH, htt{p,ps}_proxy, CFLAGS, PYTHONUSERBASE} to .deepspeed_env" { echo "PATH=${PATH}" ; @@ -342,7 +390,7 @@ function saveDSenv() { } > .deepspeed_env } -function setOutput() { +setOutput() { # ---- Specify output location -------------------------------- OUTPUT_PREFIX="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" OUTPUT_PREFIX="${OUTPUT_PREFIX}_hs${HIDDEN}_mb${MICRO_BATCH}" @@ -367,7 +415,7 @@ function setOutput() { printf "Checkpoints will be saved to: %s\n" $(printYellow "${CKPT_DIR}") } -function buildDSconfig() { +buildDSconfig() { # ---- Build DeepSpeed Config 
--------------------------------- export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}" export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" @@ -381,13 +429,13 @@ function buildDSconfig() { } -function sumWeights() { +sumWeights() { local file_list=$1 weights=$(cat "${file_list}" | awk '{print $1}' | tr '\n' '\ ,\ ' | sed 's/^/[/g' | sed 's/$/]/g' | tr '\ ' "\,\ ") python3 -c "import numpy as np; print(np.sum(${weights}))" } -function sumFiles() { +sumFiles() { local rd=$1 for f in $("${rd}/*.txt"); do ws=$(sumWeights "${rd}/${f}") @@ -397,46 +445,23 @@ function sumFiles() { ######################################################## # Setup / activate conda environment, +# NOTE: +# +# Jerome's `frameworks_2024_5_v2` seems broken ?? +# - seems to be missing `python3 -c 'from mpi4py import MPI'` ??? +# - consequently, we leave the setup below commented out (for the time +# being): +# if [[ -z "${CONDA_PREFIX-}" ]]; then +# module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 +# else +# echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" +# fi ######################################################## setup_conda_sunspot() { ###### check if CONDA_PREFIX non-empty ################ if [[ -z "${CONDA_PREFIX:-}" ]]; then - # eval "$(~/miniconda3/bin/conda shell.zsh hook)" - # conda activate anl_24_q2_release module use /soft/preview-modulefiles/24.086.0 ; module load frameworks/2024.04.15.002.lua fi - # XXX: ------------------------------------------------------------------- - # Jerome's `frameworks_2024_5_v2` seems broken ?? - # - seems to be missing `python3 -c 'from mpi4py import MPI'` ??? 
- # - consequently, we leave the setup below commented out (for the time - # being): - # if [[ -z "${CONDA_PREFIX-}" ]]; then - # module use -a /home/jmitche1/anl_release/2024/q2 ; module load frameworks_2024_5_v2 - # else - # echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" - # fi - # ------------------------------------------------------------------------ - - ###### check if VIRTUAL_ENV non-empty #################################### - # venvs/anl_24_q2_release/bin/activate - # if [[ -d "${DEFAULT_VENV_PATH}" ]]; then - if [[ -z "${VIRTUAL_ENV:-}" ]]; then - if [[ -n "${CONDA_PREFIX}" ]]; then - VENV_DIR="${WORKING_DIR}/venvs/$(echo ${CONDA_PREFIX} | tr '\/' '\t' | awk '{print $NF}')" - else - VENV_DIR="${WORKING_DIR}/venvs/anl_24_q2_release" - fi - echo "Caught virtual env at ${VENV_DIR}!" - # source "${VENV_DIR}/bin/activate" || - if [[ ! -f "${VENV_DIR}/bin/activate" ]]; then - printf "[!! %s]: Unable to locate %s\n" "$(printRed "ERROR")" "$(printMagenta "${VENV_DIR}/bin/activate")" - # echo "[!ERROR]: Unable to locate ${VENV_DIR}/bin/activate !!" - else - source "${VENV_DIR}/bin/activate" - fi - else - echo "Found existing python at: $(which python3)" - fi } ######################## @@ -453,6 +478,21 @@ setup_conda_sirius() { fi } +######################## +# Setup conda on Polaris +######################## +setup_conda_polaris() { + # unset MPICH_GPU_SUPPORT_ENABLED + ###### check if CONDA_PREFIX non-empty ################ + if [[ -z "${CONDA_PREFIX-}" ]]; then + # if so, load the default conda/2024-04-29 + # module and activate base environment + module use /soft/modulefiles ; module load conda ; conda activate base + else + echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" + fi +} + setup_venv_from_conda() { if [[ -z "${CONDA_PREFIX}" ]]; then echo "No ${CONDA_PREFIX} found." # Exiting." 
@@ -488,32 +528,6 @@ setup_venv_from_conda() { } -######################## -# Setup conda on Polaris -######################## -setup_conda_polaris() { - # unset MPICH_GPU_SUPPORT_ENABLED - ###### check if CONDA_PREFIX non-empty ################ - if [[ -z "${CONDA_PREFIX-}" ]]; then - # if so, load the default conda/2024-04-29 - # module and activate base environment - module use /soft/modulefiles - module load conda/2024-04-29 ; conda activate base - else - echo "Caught CONDA_PREFIX=${CONDA_PREFIX}" - fi - ###### check if VIRTUAL_ENV non-empty ################# - if [[ -z "${VIRTUAL_ENV:-}" ]]; then - DEFAULT_VENV_PATH=${WORKING_DIR}/venvs/polaris/2024-04-29 - if [[ -d "${DEFAULT_VENV_PATH}" ]]; then - echo "Caught virtual env at ${DEFAULT_VENV_PATH}!" - source "${WORKING_DIR}/venvs/polaris/2024-04-29/bin/activate" - fi - else - echo "Found existing python at: $(which python3)" - fi -} - ########################################################## # Check that we can find the `.py` file we wish to launch ########################################################## @@ -532,12 +546,14 @@ check_executable() { -function setEnv() { - local virtual_env="${VIRTUAL_ENV-}" - local conda_prefix="${CONDA_PREFIX-}" +setEnv() { + local virtual_env="${VIRTUAL_ENV:-}" + local conda_prefix="${CONDA_PREFIX:-}" if [[ -n "${conda_prefix}" && -z "${virtual_env}" ]]; then echo "No virtual environment found." echo "Using conda from: ${conda_prefix}" + echo "Setting up venv from ${CONDA_PROMPT_MODIFIER:-}" + setup_venv_from_conda elif [[ -n "${virtual_env}" && -z "${conda_prefix}" ]]; then echo "No conda found." echo "Using virtual_env from: ${virtual_env}" @@ -589,6 +605,9 @@ function setEnv() { echo "Unable to setup python environment. 
Exiting" exit 1 fi + if [[ -z "${virtual_env}" ]]; then + setup_venv_from_conda + fi ##################################################################### pystr="Using: $(which python3)" printf "[python] %s" "$(printMagenta ${pystr})" @@ -603,7 +622,7 @@ function setEnv() { # - If so, use this. # - Otherwise, make default HOSTFILEs from "${PBS_NODEFILE}" ###################################################################### -function makeHostfiles() { +makeHostfiles() { if [[ -n "${HOSTFILE}" ]]; then printf "!! USING CUSTOM HOSTFILE FROM: %s" "${HOSTFILE}" else @@ -615,31 +634,12 @@ function makeHostfiles() { # Setup tokenizer as either Llama2 or GPT2 style ################################################## setup_tokenizer_and_data() { - # if [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == LLAMA* ]]; then - # tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model - # export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ - # export TOKENIZER_TYPE="Llama2" - # setData - # elif [[ ${tok} == gpt* || ${tok} == GPT* ]]; then - # export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" - # export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" - # export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" - # export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" - # export TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file $MERGE_FILE" - # # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}" - # export TOKENIZER_TYPE="GPT2" - # else - # echo "Unknown tokenizer ${tok} passed" - # fi if [[ "$#" == 1 ]]; then tok="$1" dfl="${DATA_FILE_LIST:-}" - # echo "Setting up tokenizer with ${tok}" - # elif [[ "$#" -ne 2 ]]; then elif [[ "$#" == 2 ]]; then tok="$1" dfl="$2" - # tok="${TOKENIZER_TYPE:-Llama2}" else echo "Incorrect number of arguments passed. 
Received: $#, expected 2" fi @@ -648,14 +648,18 @@ setup_tokenizer_and_data() { if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then export TOKENIZER_TYPE="GPT2" export TOKENIZER_FLAGS="--tokenizer-type GPT2BPETokenizer" - export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + local machine=$(where_am_i) + if [[ ${machine} == "polaris" ]]; then + export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}" + elif [[ ${machine} == "sunspot" ]]; then + export DATA_PARENT="${DATA_PARENT:-/gila/Aurora_deployment/foremans/anl_24_q2_release/Megatron-DeepSpeed/dataset}" + else + export DATA_PARENT="${DATA_PARENT:-${WORKING_DIR}/dataset}" + fi export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json" export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt" export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document" - # TOKENIZER_FLAGS="--data-path $DATA_PATH--vocab-file $VOCAB_FILE --merge-file ${MERGE_FILE}" export DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}" - # export TOKENIZER_TYPE="${TOKENIZER_TYPE:-GPT2}" - # else [[ ${tok} == Llama* || ${tok} == llama* || ${tok} == LLAMA* ]]; then else export DATA_FLAGS="" export TOKENIZER_TYPE="Llama2" @@ -665,19 +669,10 @@ setup_tokenizer_and_data() { if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}" setData "${dfl}" || exit - # setData "${DATA_FILE_LIST-}" || exit # 09. 
Specify `DATA_FILE_LIST` for dolma dataset fi - # --tokenizer-model ${TOKENIZER_MODEL} \ fi - # export DATA_FLAGS="${DATA_FLAGS}" - # export TOKENIZER_TYPE="${TOKENIZER_TYPE}" - # export TOKENIZER_FLAGS="${TOKENIZER_FLAGS}" printf "[setData] DATA_FLAGS: %s\n" "$(printGreen ${DATA_FLAGS})" printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta ${TOKENIZER_FLAGS})" - # if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then - # echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}" - # setData "${DATA_FILE_LIST-}" || exit # 09. Specify `DATA_FILE_LIST` for dolma dataset - # fi } @@ -686,7 +681,7 @@ setup_tokenizer_and_data() { # Ensure `DATA_FILE_LIST` is set, # fallback to default values if necessary. ############################################### -function setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] +setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] # if [[ "$#" -ne 1 ]]; then # tok="${TOKENIZER_TYPE:-Llama2}" # else @@ -764,7 +759,7 @@ function setData() { # ------------------------[dfl: abbrv. 
for DATA_FILE_LIST] # printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta ${TOKENIZER_FLAGS})" } -function generateDSconfig() { +generateDSconfig() { for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ "$PP" "$DTYPE" do @@ -926,35 +921,35 @@ $flops_profiler EOT } -function printBlack() { +printBlack() { printf "\e[1;30m%s\e[0m\n" "$@" } -function printRed() { +printRed() { printf "\e[1;31m%s\e[0m\n" "$@" } -function printGreen() { +printGreen() { printf "\e[1;32m%s\e[0m\n" "$@" } -function printYellow() { +printYellow() { printf "\e[1;33m%s\e[0m\n" "$@" } -function printBlue() { +printBlue() { printf "\e[1;34m%s\e[0m\n" "$@" } -function printMagenta() { +printMagenta() { printf "\e[1;35m%s\e[0m\n" "$@" } -function printCyan() { +printCyan() { printf "\e[1;36m%s\e[0m\n" "$@" } -function printWhite() { +printWhite() { printf "\e[1;37m%s\e[0m\n" "$@" } From 3cd3f1a9a695341643dd29c87a47969de90a4812 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 08:22:48 -0500 Subject: [PATCH 264/268] Update `train_aGPT_7B.sh` --- train_aGPT_7B.sh | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index e5035ae27b..72322d655c 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -1,5 +1,5 @@ #!/bin/bash --login -# + NOW="$(date "+%Y-%m-%d-%H%M%S")" cd "${PBS_O_WORKDIR}" || exit @@ -7,30 +7,13 @@ cd "${PBS_O_WORKDIR}" || exit HOSTNAME=$(hostname) if [[ "${HOSTNAME}" == x3* ]]; then MACHINE="polaris" - # XXX: - # - On Polaris, we see that: - # - on 1 or 2 nodes, only MICRO_BATCH=1 will fit in memory - # - on 8 nodes, MICRO_BATCH=2 will fit in memory - # - on 48 nodes, MICRO_BATCH=4 will fit in memory - nhosts=$(wc -l < "${PBS_NODEFILE}") - if [[ "${nhosts}" == 1 ]]; then - export MBS=1 - elif [[ "${nhosts}" == 2 ]]; then - export MBS=1 - elif [[ "${nhosts}" -ge 3 ]]; then - export MBS=2 - elif [[ "${nhosts}" -ge 8 ]]; then - export MBS=4 - fi elif [[ "${HOSTNAME}" == 
x1* ]]; then MACHINE="sunspot" elif [[ "${HOSTNAME}" == x4* ]]; then MACHINE="aurora" fi -export nhosts -OUTDIR="${PBS_O_WORKDIR}/pbslogs" -mkdir -p "${OUTDIR}" +OUTDIR="${PBS_O_WORKDIR}/pbslogs" && mkdir -p "${OUTDIR}" OUTFILE="${OUTDIR}/${PBS_JOBID}-${NOW}.log" echo "+---------------------------------------------------------+" From bc1dbfdd7943b4513c9a167d934ae0e5e1d5e171 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Fri, 24 May 2024 13:59:39 -0500 Subject: [PATCH 265/268] Fix `--data-cache-path` in `ALCF/helpers.sh, train_llama_alcf.sh` --- ALCF/helpers.sh | 37 +++++++++++++++++++++++-------------- train_llama_alcf.sh | 5 +++-- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh index a9af22416d..c0927766f2 100644 --- a/ALCF/helpers.sh +++ b/ALCF/helpers.sh @@ -135,12 +135,13 @@ loadCondaEnv() { setupLauncher() { # outdir=$1 - if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then - export LAUNCHER="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" - else + if [[ "${LAUNCH_CMD:-"MPICH"}" == "deepspeed" ]]; then # Assert `./hostfile_deepspeed` exists export hfds="${WORKING_DIR}/hostfile_deepspeed" && [ -f "${hfds}" ] || exit export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}" + # if [[ -n "${DIST_LAUNCH}" && ${LAUNCH_CMD:-"MPICH"} != "deepspeed" ]]; then + else + export LAUNCHER="${DIST_LAUNCH} --genvall --cpu-bind depth -d 16 $(which python3) -Wignore ${EXEC}" fi printf "Launching with: %s\n" "$(printRed "${LAUNCH_CMD}")" printf " %s" "$(printMagenta ${LAUNCHER})" @@ -390,21 +391,27 @@ saveDSenv() { } > .deepspeed_env } -setOutput() { + +get_output_prefix() { # ---- Specify output location -------------------------------- - OUTPUT_PREFIX="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" - OUTPUT_PREFIX="${OUTPUT_PREFIX}_hs${HIDDEN}_mb${MICRO_BATCH}" - OUTPUT_PREFIX="${OUTPUT_PREFIX}_seq${SEQ}_gb${GLOBAL_BATCH}" - 
OUTPUT_PREFIX="${OUTPUT_PREFIX}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" - OUTPUT_PREFIX="${OUTPUT_PREFIX}_lr${LR}_lwf${LR_WARMUP_FRAC}" + pre="ws${WORLD_SIZE}_ds_stage${ZERO_STAGE}_nl${NLAYERS}" + pre="${pre}_hs${HIDDEN}_mb${MICRO_BATCH}" + pre="${pre}_seq${SEQ}_gb${GLOBAL_BATCH}" + pre="${pre}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}" + pre="${pre}_lr${LR}_lwf${LR_WARMUP_FRAC}" if [[ -n "${LR_DECAY_ITERS}" ]]; then - OUTPUT_PREFIX="${OUTPUT_PREFIX}_ldi${LR_DECAY_ITERS}" + pre="${pre}_ldi${LR_DECAY_ITERS}" fi if [[ -z "${NO_FLASH_ATTN:-}" ]]; then - OUTPUT_PREFIX="${OUTPUT_PREFIX}_flash" + pre="${pre}_flash" fi - export OUTPUT_PREFIX="${OUTPUT_PREFIX}" + export OUTPUT_PREFIX="${pre}" + echo "${pre}" +} + +setOutput() { # OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%m%d%H%M%S)_${HOSTNAME}" + OUTPUT_PREFIX=$(get_output_prefix) OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}" export OUTPUT_DIR="${OUTPUT_DIR}" export OUTPUT_LOG="${OUTPUT_DIR}/output.log" @@ -665,7 +672,7 @@ setup_tokenizer_and_data() { export TOKENIZER_TYPE="Llama2" tm="${WORKING_DIR}/ALCF/tokenizer.model" # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^ - export TOKENIZER_FLAGS="${TOKENIZER_FLAGS} --tokenizer-type Llama2Tokenizer --tokenizer-model ${TOKENIZER_MODEL}" + export TOKENIZER_FLAGS="--tokenizer-type Llama2Tokenizer --tokenizer-model ${TOKENIZER_MODEL}" if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}" setData "${dfl}" || exit @@ -737,13 +744,14 @@ setData() { # ------------------------[dfl: abbrv. 
for DATA_FILE_LIST] ndocs=$(wc -l < "${dfl}") ws=$(sumWeights "${dfl}") dfl_stem=$(echo "${dfl}" | tr "\/" "\t" | awk '{print $NF}' | sed "s/\.txt//g") + # dcp="${OUTPUT_PREFIX:-$(get_output_prefix)}/.cache/${dfl_stem}/index-cache" dcp=".cache/${dfl_stem}/index-cache" export DATA_FILE_LIST="${dfl}" export NUM_DOCS="${ndocs}" export WEIGHT_SUM="${ws}" export DFL_STEM="${dfl_stem}" export DATA_CACHE_PATH="${dcp}" - export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST} --data-cache-path ${DATA_CACHE_PATH}" + export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}" # --data-cache-path ${DATA_CACHE_PATH}" echo "--------------------" echo "Updated environment:" printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}" @@ -751,6 +759,7 @@ setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST] printf "WEIGHT_SUM: %s\n" "${WEIGHT_SUM}" printf "DFL_STEM: %s\n" "${DFL_STEM}" printf "DATA_CACHE_PATH: %s\n" "${DATA_CACHE_PATH}" + printf "DATA_FLAGS: %s\n" "${DATA_FLAGS}" echo "--------------------" # fi # export DATA_FLAGS="${DATA_FLAGS}" diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh index 9a6d39923c..bf346e2144 100644 --- a/train_llama_alcf.sh +++ b/train_llama_alcf.sh @@ -95,6 +95,7 @@ TBDIR="${CKPT_DIR}/tensorboard" mkdir -p "${TBDIR}" data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}" +echo "Using data_cache_path: ${data_cache_path}" export DEFAULTS="\ @@ -116,7 +117,6 @@ custom_args=" $@" # --log-memory-to-tensorboard \ # --data-file-list ${DATA_FILE_LIST} \ # --data-file-list ${DATA_FILE_LIST} \ - # --data-cache-path ${data_cache_path} \ # --tokenizer-type Llama2Tokenizer \ # --tokenizer-model ${TOKENIZER_MODEL} \ run_cmd=" @@ -143,10 +143,11 @@ run_cmd=" --pipeline-model-parallel-size ${PP} \ --num-key-value-heads ${NUM_KV_HEAD} \ --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --data-cache-path ${data_cache_path} \ + ${DATA_FLAGS} \ ${LR_ARGS} \ ${LLAMA_ARGS} \ ${TIMING_STR} \ - ${DATA_FLAGS} \ 
${TOKENIZER_FLAGS} \ $ds_args \ ${gpt_args[*]} \ From c3a4451f646b3ef17a1a8f2d371ae159974c4ca3 Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 25 May 2024 11:21:00 -0500 Subject: [PATCH 266/268] Add `ALCF/sunspot-env-2024-04-15-002.sh` --- ALCF/sunspot-env-2024-04-15-002.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 ALCF/sunspot-env-2024-04-15-002.sh diff --git a/ALCF/sunspot-env-2024-04-15-002.sh b/ALCF/sunspot-env-2024-04-15-002.sh new file mode 100644 index 0000000000..3b7155675d --- /dev/null +++ b/ALCF/sunspot-env-2024-04-15-002.sh @@ -0,0 +1,4 @@ +#!/bin/bash --login + +module use /soft/preview-modulefiles/24.086.0 +module load frameworks/2024.04.15.002.lua From 0fc3919d1587adc3b13b4afe2f9e1b90cbd6874f Mon Sep 17 00:00:00 2001 From: Sam Foreman Date: Sat, 25 May 2024 11:22:03 -0500 Subject: [PATCH 267/268] Update `train_aGPT_7B.sh` --- train_aGPT_7B.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh index 72322d655c..9dc0f1d946 100644 --- a/train_aGPT_7B.sh +++ b/train_aGPT_7B.sh @@ -25,6 +25,6 @@ echo "+---------------------------------------------------------+" export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=6000 echo "${OUTFILE}" >> "${OUTDIR}/latest" # export DEBUG=1 -export MICRO_BATCH="${MICRO_BATCH:-${MBS}}" +# export MICRO_BATCH="${MICRO_BATCH:-${MBS}}" export DATA_FILE_LIST="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/${MACHINE}/dolma_v1_7_file_list.txt}" bash "${PBS_O_WORKDIR}/train_llama_alcf.sh" |& tee "${OUTFILE}" From 2b5b41f7acb8bb685e7e7f794af12ef6af317674 Mon Sep 17 00:00:00 2001 From: Varuni Sastry <88804132+vksastry@users.noreply.github.com> Date: Thu, 30 May 2024 22:29:53 -0500 Subject: [PATCH 268/268] convert MDS checkpoint to Hf Llama model --- ALCF/mds_to_hf.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 ALCF/mds_to_hf.py diff --git a/ALCF/mds_to_hf.py b/ALCF/mds_to_hf.py new file mode 100644 index 
0000000000..a336788274 --- /dev/null +++ b/ALCF/mds_to_hf.py @@ -0,0 +1,91 @@ +# Usage : python mds_to_hf.py --mds_checkpoint --output_dir +# Tips : Do not run on login node. +# This script currently only takes care of tp=1. Takes a AuroraGPT Llama model trained with Megatron-DeepSpeed and converts to LLamaCausalForLM architecture from HuggingFace. + +import argparse +import torch +import pdb +import os +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer + +def repeat_kv_wt(x,np): + return torch.repeat_interleave(x, dim=0, repeats=np) + +def Update_llama_config(Llama_config, mds_args): + if mds_args['swiglu']: + Llama_config.hidden_act = "silu" + Llama_config.hidden_size = mds_args['hidden_size'] + Llama_config.intermediate_size = mds_args['ffn_hidden_size'] + Llama_config.max_position_embeddings = mds_args['max_position_embeddings'] + Llama_config.num_attention_heads = mds_args['num_attention_heads'] + Llama_config.num_hidden_layers = mds_args['num_layers'] + Llama_config.num_key_value_heads = mds_args['num_key_value_heads'] + Llama_config.rms_norm_eps = mds_args['layernorm_epsilon'] + Llama_config.rope_theta = mds_args['rope_theta'] + Llama_config.vocab_size = mds_args['padded_vocab_size'] + return Llama_config + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--mds_checkpoint', required=True) + parser.add_argument('--output_dir', required=True) + args = parser.parse_args() + + # make output_dir if it does not exits. 
+ if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + filename = str(args.mds_checkpoint) + if not filename.split("/")[-1].startswith('mp_rank') and not filename.split("/")[-1].endswith('.pt'): + assert ("Provide the right file path, The file should be of format mp_rank_*.pt") + print(f"loading mds checkpoint {filename}") + + mds_model = torch.load(args.mds_checkpoint,map_location=torch.device('cpu')) + Llama_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",cache_dir='/eagle/datascience/vsastry/huggingface') + + Llama_config = Llama_model.config + Updated_Llama_config = Update_llama_config(Llama_config, mds_model['args'].__dict__) + # save the updated config.json file + Updated_Llama_config.to_json_file(os.path.join(args.output_dir,'config.json')) + + state_dict = {} + dim = mds_model['args'].__dict__['kv_channels'] + inv_freq = 1.0 / (mds_model['args'].__dict__['rope_theta'] ** (torch.arange(0,dim, 2).float() / dim)) + hidden_size = mds_model['args'].__dict__['hidden_size'] + kv_dim = mds_model['args'].__dict__['kv_channels'] * mds_model['args'].__dict__['num_key_value_heads'] + kv_groups = mds_model['args'].__dict__['num_attention_heads'] // mds_model['args'].__dict__['num_key_value_heads'] + for layer_i in range(Updated_Llama_config.__dict__['num_hidden_layers']): + # SELF ATTENTION layers. + # get the q, k, v weights separately. Keeping k and v at the GQA head dim, since the transformers/models/llama/modelling_utils will take care of it. 
+ fused_qkv = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.query_key_value.weight"] + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = fused_qkv[0:hidden_size] + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = fused_qkv[hidden_size:hidden_size+kv_dim] + #state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size:hidden_size+kv_dim], kv_groups) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim] + #state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = repeat_kv_wt(fused_qkv[hidden_size+kv_dim:hidden_size+2*kv_dim],kv_groups) + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.self_attention.dense.weight"] + + # MLP Layers + fused_mlp = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_h_to_4h.weight"] + chunked_mlp = torch.chunk(fused_mlp,2,dim=0) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = chunked_mlp[0] + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = chunked_mlp[1] + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.mlp.dense_4h_to_h.weight"] + + #LayerNorm weights and RoPe + state_dict[f"model.layers.{layer_i}.input_layernorm.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.input_layernorm.weight"] + state_dict[f"model.layers.{layer_i}.post_attention_layernorm.weight"] = mds_model['module']['language_model']['encoder'][f"layers.{layer_i}.post_attention_layernorm.weight"] + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + + # Get the non-encoder layer weights. 
+ state_dict["model.embed_tokens.weight"] = mds_model['module']['language_model']['embedding']['word_embeddings']['weight'] + state_dict["model.norm.weight"] = mds_model['module']['language_model']['encoder']['final_layernorm.weight'] + state_dict["lm_head.weight"] = mds_model['module']['language_model']['output_layer']['weight'] + + # Save the model in the hf output path. + torch.save(state_dict, os.path.join(args.output_dir,"pytorch_model.bin")) + + +